# PANDAS LEARNING NOTES

> CREATE DATE : 01.29.2023

> AUTHOR : BURAK YILMAZ



## TABLE OF CONTENTS

1. READING FLAT FILES
1. DATA PROFILING BASICS
1. FILTERING
1. HANDLING MISSING VALUES
1. HANDLING DUPLICATE VALUES
1. JOIN
1. GROUP BY AND AGGREGATE FUNCTIONS
1. PIVOT
1. CONCATENATE
1. STRING FUNCTIONS
1. DATE FUNCTIONS
1. DATAFRAME WRITE FUNCTIONS


In [6]:
# import statements 

import pandas as pd
import numpy as np 
import xlrd

### 1) READING FLAT FILES

In [5]:
# CSV READ

sep = ","
encoding = 'latin-1'

df_csv = pd.read_csv(r"D:\98_TELE2 CLOUD\Tele2 Cloud\96_DATASETS\02_iris.csv",
                     sep = sep,
                     encoding = encoding,  # use this if data contains special chars
                     index_col = None,  # default is none. use it if data contains ID column
                     )

# use r before path if path contains special chars
# preview the data : print(df_csv.head())
# check more methods on here : https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html



In [9]:
# EXCEL READ
df_excel = pd.read_excel(r"D:\98_TELE2 CLOUD\Tele2 Cloud\96_DATASETS\00_clustering_retail_data.xlsx",
                         sheet_name=0,  # default is 0. Specify if data is on another sheet
                         header=0,  # default is 0. Use it if headers are at different row
                         index_col=None  # default is none. use it if data contains ID column
                         )

# use r before path if path contains special chars
# preview the data: print(df_excel.head())
# check more methods on here : https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html

### 2) DATA PROFILING BASICS

In [11]:
# preview the top and bottom rows
display(df_excel.head(5))
display(df_excel.tail(5))


Unnamed: 0,Store Code,Store Name,GeoName ID,Country,City,Latitude - Longtitude,Latitude,Longtitude,Store Capacity,Trendy Customer %
0,1,Store 1,3432043,Argentina,La Plata,"-34.92145, -57.95453",-34.92145,-57.95453,Medium Stores,0.49
1,3,Store 3,3835994,Argentina,Santa Rosa,"-36.61667, -64.28333",-36.61667,-64.28333,Small Stores,0.26
2,5,Store 5,3836277,Argentina,Santa Fe,"-31.64881, -60.70868",-31.64881,-60.70868,Small Stores,0.45
3,9,Store 9,3836873,Argentina,San Miguel de Tucuman,"-26.82414, -65.2226",-26.82414,-65.2226,Big Stores,0.28
4,14,Store 14,2160517,Australia,Launceston,"-41.43876, 147.13467",-41.43876,147.13467,Big Stores,0.36


Unnamed: 0,Store Code,Store Name,GeoName ID,Country,City,Latitude - Longtitude,Latitude,Longtitude,Store Capacity,Trendy Customer %
995,987,Store 987,5406567,United States,Visalia,"36.33023, -119.29206",36.33023,-119.29206,Small Stores,0.87
996,993,Store 993,4049979,United States,Birmingham,"33.52066, -86.80249",33.52066,-86.80249,Small Stores,0.91
997,995,Store 995,4280539,United States,Topeka,"39.04833, -95.67804",39.04833,-95.67804,Small Stores,0.87
998,997,Store 997,4471025,United States,High Point,"35.95569, -80.00532",35.95569,-80.00532,Small Stores,0.82
999,1000,Store 1000,5097529,United States,Edison,"40.51872, -74.4121",40.51872,-74.4121,Small Stores,0.86


In [15]:
# get the column names of the dataset
display(df_excel.columns)

# get the column names. More Readable
for cols in df_excel.columns:
    print(cols)


Index(['Store Code', 'Store Name', 'GeoName ID', 'Country', 'City',
       'Latitude - Longtitude', 'Latitude', 'Longtitude', 'Store Capacity',
       'Trendy Customer %'],
      dtype='object')

Store Code
Store Name
GeoName ID
Country
City
Latitude - Longtitude
Latitude
Longtitude
Store Capacity
Trendy Customer %


In [14]:
# check the datatypes of the columns
display(df_excel.dtypes)


Store Code                 int64
Store Name                object
GeoName ID                 int64
Country                   object
City                      object
Latitude - Longtitude     object
Latitude                 float64
Longtitude               float64
Store Capacity            object
Trendy Customer %        float64
dtype: object

In [16]:
# get the summary stats of the data
display(df_excel.describe())

Unnamed: 0,Store Code,GeoName ID,Latitude,Longtitude,Trendy Customer %
count,1000.0,1000.0,1000.0,1000.0,1000.0
mean,500.5,2865912.0,30.354904,28.097894,0.64324
std,288.819436,1954889.0,20.934813,79.28129,0.244394
min,1.0,255274.0,-45.87416,-123.35155,0.15
25%,250.75,1784974.0,24.646237,-38.632435,0.44
50%,500.5,2522670.0,34.78666,23.66903,0.7
75%,750.25,3468391.0,43.374243,104.6293,0.86
max,1000.0,12501200.0,65.01236,176.16667,0.95


In [17]:

# get the summary of only numeric columns
df_excel.select_dtypes('number').describe()

Unnamed: 0,Store Code,GeoName ID,Latitude,Longtitude,Trendy Customer %
count,1000.0,1000.0,1000.0,1000.0,1000.0
mean,500.5,2865912.0,30.354904,28.097894,0.64324
std,288.819436,1954889.0,20.934813,79.28129,0.244394
min,1.0,255274.0,-45.87416,-123.35155,0.15
25%,250.75,1784974.0,24.646237,-38.632435,0.44
50%,500.5,2522670.0,34.78666,23.66903,0.7
75%,750.25,3468391.0,43.374243,104.6293,0.86
max,1000.0,12501200.0,65.01236,176.16667,0.95


In [19]:
# get the unique values of string objects
display(df_excel["Country"].unique())

array(['Argentina', 'Australia', 'Austria', 'Belgium',
       'Bosnia and Herzegovina', 'Brazil', 'Bulgaria', 'Canada', 'China',
       'Croatia', 'Czech Republic', 'Denmark', 'Egypt', 'Finland',
       'France', 'Germany', 'Greece', 'Hungary', 'India', 'Ireland',
       'Italy', 'Japan', 'Kazakhstan', 'Latvia', 'Mexico', 'Morocco',
       'Netherlands', 'New Zealand', 'Norway', 'Poland', 'Portugal',
       'Romania', 'Serbia', 'Slovakia', 'Spain', 'Sweden', 'Switzerland',
       'Turkey', 'United Kingdom', 'United States', 'Iceland', 'Slovenia'],
      dtype=object)

In [21]:
# sort the DataFrame columns
df_excel.sort_values(by="Country", ascending=True, inplace=True)
display(df_excel)

Unnamed: 0,Store Code,Store Name,GeoName ID,Country,City,Latitude - Longtitude,Latitude,Longtitude,Store Capacity,Trendy Customer %
0,1,Store 1,3432043,Argentina,La Plata,"-34.92145, -57.95453",-34.92145,-57.95453,Medium Stores,0.49
2,5,Store 5,3836277,Argentina,Santa Fe,"-31.64881, -60.70868",-31.64881,-60.70868,Small Stores,0.45
1,3,Store 3,3835994,Argentina,Santa Rosa,"-36.61667, -64.28333",-36.61667,-64.28333,Small Stores,0.26
300,2,Store 2,3848950,Argentina,La Rioja,"-29.41105, -66.85067",-29.41105,-66.85067,Medium Stores,0.55
301,4,Store 4,3436043,Argentina,Berazategui,"-34.76531, -58.21278",-34.76531,-58.21278,Big Stores,0.66
...,...,...,...,...,...,...,...,...,...,...
978,949,Store 949,5074472,United States,Omaha,"41.25626, -95.94043",41.25626,-95.94043,Big Stores,0.83
965,920,Store 920,4273837,United States,Kansas City,"39.11417, -94.62746",39.11417,-94.62746,Small Stores,0.82
964,919,Store 919,4679195,United States,Carrollton,"32.95373, -96.89028",32.95373,-96.89028,Small Stores,0.86
558,931,Store 931,4409896,United States,Springfield,"37.21533, -93.29824",37.21533,-93.29824,Small Stores,0.58


In [23]:
# multiple column sorting

df_excel.sort_values(by=["Store Code", "Country"], ascending=[True, False], inplace=True)
display(df_excel)

Unnamed: 0,Store Code,Store Name,GeoName ID,Country,City,Latitude - Longtitude,Latitude,Longtitude,Store Capacity,Trendy Customer %
0,1,Store 1,3432043,Argentina,La Plata,"-34.92145, -57.95453",-34.92145,-57.95453,Medium Stores,0.49
300,2,Store 2,3848950,Argentina,La Rioja,"-29.41105, -66.85067",-29.41105,-66.85067,Medium Stores,0.55
1,3,Store 3,3835994,Argentina,Santa Rosa,"-36.61667, -64.28333",-36.61667,-64.28333,Small Stores,0.26
301,4,Store 4,3436043,Argentina,Berazategui,"-34.76531, -58.21278",-34.76531,-58.21278,Big Stores,0.66
2,5,Store 5,3836277,Argentina,Santa Fe,"-31.64881, -60.70868",-31.64881,-60.70868,Small Stores,0.45
...,...,...,...,...,...,...,...,...,...,...
299,996,Store 996,4281730,United States,Wichita,"37.69224, -97.33754",37.69224,-97.33754,Medium Stores,0.17
998,997,Store 997,4471025,United States,High Point,"35.95569, -80.00532",35.95569,-80.00532,Small Stores,0.82
578,998,Store 998,4684888,United States,Dallas,"32.78306, -96.80667",32.78306,-96.80667,Big Stores,0.76
579,999,Store 999,4951305,United States,South Boston,"42.33343, -71.04949",42.33343,-71.04949,Medium Stores,0.77


In [53]:
# check the correlation between columns

df_excel.corr()   # Note: The corr() method ignores "not numeric" columns.

  df_excel.corr()   # Note: The corr() method ignores "not numeric" columns.


Unnamed: 0,Store Code,GeoName ID,Latitude,Longtitude,Trendy Customer %
Store Code,1.0,0.139071,0.398122,-0.327448,-0.026173
GeoName ID,0.139071,1.0,-0.010939,-0.477311,-0.01783
Latitude,0.398122,-0.010939,1.0,0.038019,-0.012816
Longtitude,-0.327448,-0.477311,0.038019,1.0,0.011052
Trendy Customer %,-0.026173,-0.01783,-0.012816,0.011052,1.0


### 3) FILTERING

In [27]:
# create filter variable for readability
first_filter = (df_excel["Trendy Customer %"] > 0.5) & (df_excel["Country"] == "Sweden")

# To get the entire dataframe with filtered values :
df_filtered1 = df_excel[first_filter]

display(df_filtered1)

Unnamed: 0,Store Code,Store Name,GeoName ID,Country,City,Latitude - Longtitude,Latitude,Longtitude,Store Capacity,Trendy Customer %
521,817,Store 817,602150,Sweden,Umea,"63.82842, 20.25972",63.82842,20.25972,Small Stores,0.78
929,819,Store 819,2664454,Sweden,Vaesteras,"59.61617, 16.55276",59.61617,16.55276,Small Stores,0.85
930,820,Store 820,2694762,Sweden,Linkoeping,"58.41086, 15.62157",58.41086,15.62157,Big Stores,0.94
522,821,Store 821,2676209,Sweden,Soedermalm,"59.31278, 18.07577",59.31278,18.07577,Small Stores,0.62
523,822,Store 822,2673730,Sweden,Stockholm,"59.32938, 18.06871",59.32938,18.06871,Big Stores,0.52
931,823,Store 823,2675408,Sweden,Sollentuna,"59.42804, 17.95093",59.42804,17.95093,Big Stores,0.83
524,825,Store 825,2711537,Sweden,Goeteborg,"57.70716, 11.96679",57.70716,11.96679,Big Stores,0.56
525,826,Store 826,2706767,Sweden,Helsingborg,"56.04673, 12.69437",56.04673,12.69437,Small Stores,0.52


In [29]:
# create filter variable for readability
first_filter = (df_excel["Trendy Customer %"] > 0.5) & (df_excel["Country"] == "Sweden")

# To get specific columns with filtered values:
df_filtered2 = df_excel.loc[first_filter, ["Country", "City", "Trendy Customer %"]]

display(df_filtered2)

Unnamed: 0,Country,City,Trendy Customer %
521,Sweden,Umea,0.78
929,Sweden,Vaesteras,0.85
930,Sweden,Linkoeping,0.94
522,Sweden,Soedermalm,0.62
523,Sweden,Stockholm,0.52
931,Sweden,Sollentuna,0.83
524,Sweden,Goeteborg,0.56
525,Sweden,Helsingborg,0.52


In [31]:
# filter with (is in) method :

 # 1: create the list you want to check
nordic_countries = ["Sweden", "Norway", "Denmark"]    

# 2: create the filter variable
nordic_filter = df_excel["Country"].isin(nordic_countries)

 # 3: apply the filter
df_filtered3 = df_excel[nordic_filter]


display(df_filtered3.head())


Unnamed: 0,Store Code,Store Name,GeoName ID,Country,City,Latitude - Longtitude,Latitude,Longtitude,Store Capacity,Trendy Customer %
388,338,Store 338,2618425,Denmark,Copenhagen,"55.67594, 12.56553",55.67594,12.56553,Big Stores,0.56
389,339,Store 339,2624886,Denmark,Aalborg,"57.048, 9.9187",57.048,9.9187,Small Stores,0.79
727,340,Store 340,2624652,Denmark,Arhus,"56.15674, 10.21076",56.15674,10.21076,Small Stores,0.9
102,341,Store 341,2615876,Denmark,Odense,"55.39594, 10.38831",55.39594,10.38831,Small Stores,0.5
902,741,Store 741,3161732,Norway,Bergen,"60.39299, 5.32415",60.39299,5.32415,Small Stores,0.89


In [None]:
# add loc method

In [None]:
# add iloc method

In [33]:
# To remove a column from the DataFrame:

df_excel.drop("Store Capacity", axis=1, inplace= False)  # axis 0 rows, axis 1 columns
# To remove multiple columns, just pass a list with column names in drop method. example:  ["column_1","column_2"]
display(df_excel.head())


Unnamed: 0,Store Code,Store Name,GeoName ID,Country,City,Latitude - Longtitude,Latitude,Longtitude,Store Capacity,Trendy Customer %
0,1,Store 1,3432043,Argentina,La Plata,"-34.92145, -57.95453",-34.92145,-57.95453,Medium Stores,0.49
300,2,Store 2,3848950,Argentina,La Rioja,"-29.41105, -66.85067",-29.41105,-66.85067,Medium Stores,0.55
1,3,Store 3,3835994,Argentina,Santa Rosa,"-36.61667, -64.28333",-36.61667,-64.28333,Small Stores,0.26
301,4,Store 4,3436043,Argentina,Berazategui,"-34.76531, -58.21278",-34.76531,-58.21278,Big Stores,0.66
2,5,Store 5,3836277,Argentina,Santa Fe,"-31.64881, -60.70868",-31.64881,-60.70868,Small Stores,0.45


### 4) HANDLING MISSING VALUES

In [34]:
# number of missing values in each column
display(df_excel.isnull().sum())

Store Code               0
Store Name               0
GeoName ID               0
Country                  0
City                     0
Latitude - Longtitude    0
Latitude                 0
Longtitude               0
Store Capacity           0
Trendy Customer %        0
dtype: int64

In [35]:
# when there is a string value like "Missing" or "NA" for null values, replace them with np.nan first.
df_excel.replace("Missing", np.nan, inplace=True)

In [38]:
# REMOVING MISSING VALUES

# drop rows if any of the row values is NaN.  # any = OR , all = And
# change axis to "columns" to check null values in column wise
# Use subset to null check only certain columns
# you can set threshold for null values thresh=2

df_excel.dropna(axis='index', how='any', subset=["Country", "Trendy Customer %"])   

Unnamed: 0,Store Code,Store Name,GeoName ID,Country,City,Latitude - Longtitude,Latitude,Longtitude,Store Capacity,Trendy Customer %
0,1,Store 1,3432043,Argentina,La Plata,"-34.92145, -57.95453",-34.92145,-57.95453,Medium Stores,0.49
300,2,Store 2,3848950,Argentina,La Rioja,"-29.41105, -66.85067",-29.41105,-66.85067,Medium Stores,0.55
1,3,Store 3,3835994,Argentina,Santa Rosa,"-36.61667, -64.28333",-36.61667,-64.28333,Small Stores,0.26
301,4,Store 4,3436043,Argentina,Berazategui,"-34.76531, -58.21278",-34.76531,-58.21278,Big Stores,0.66
2,5,Store 5,3836277,Argentina,Santa Fe,"-31.64881, -60.70868",-31.64881,-60.70868,Small Stores,0.45


In [None]:
# FILLING MISSING VALUES

# fill missing values in a certain column with specified value
df_excel["Trendy Customer %"].fillna(value=0, inplace=True)


In [39]:
# you can calculate the mean and use it as replace value
new_value = df_excel["Trendy Customer %"].mean()
df_excel["Trendy Customer %"].fillna(value=new_value, inplace=True)


In [40]:
# Fill missing values in different columns with different methods by using a dictionary
new_df = df_excel.fillna({
                        "Country": "Unknown Country",
                        "Trendy Customer %": 0,
                        "City": "Unknown City"}, inplace=True)

In [41]:
# other methods for filling missing values :
# limit decides how many rows will be affected by each filling at a time
df_excel.fillna(method="bfill", limit=1, inplace=True)    
df_excel.fillna(method="ffill", limit=1, inplace=True)
df_excel.interpolate(method= "linear")


Unnamed: 0,Store Code,Store Name,GeoName ID,Country,City,Latitude - Longtitude,Latitude,Longtitude,Store Capacity,Trendy Customer %
0,1,Store 1,3432043,Argentina,La Plata,"-34.92145, -57.95453",-34.92145,-57.95453,Medium Stores,0.49
300,2,Store 2,3848950,Argentina,La Rioja,"-29.41105, -66.85067",-29.41105,-66.85067,Medium Stores,0.55
1,3,Store 3,3835994,Argentina,Santa Rosa,"-36.61667, -64.28333",-36.61667,-64.28333,Small Stores,0.26
301,4,Store 4,3436043,Argentina,Berazategui,"-34.76531, -58.21278",-34.76531,-58.21278,Big Stores,0.66
2,5,Store 5,3836277,Argentina,Santa Fe,"-31.64881, -60.70868",-31.64881,-60.70868,Small Stores,0.45
...,...,...,...,...,...,...,...,...,...,...
299,996,Store 996,4281730,United States,Wichita,"37.69224, -97.33754",37.69224,-97.33754,Medium Stores,0.17
998,997,Store 997,4471025,United States,High Point,"35.95569, -80.00532",35.95569,-80.00532,Small Stores,0.82
578,998,Store 998,4684888,United States,Dallas,"32.78306, -96.80667",32.78306,-96.80667,Big Stores,0.76
579,999,Store 999,4951305,United States,South Boston,"42.33343, -71.04949",42.33343,-71.04949,Medium Stores,0.77


### 5) HANDLING DUPLICATE VALUES

In [43]:
# Count unique combinations of columns.
df_excel.value_counts("Country")

Country
China                     187
India                     144
United States             111
Brazil                     73
Japan                      71
Mexico                     43
Germany                    33
United Kingdom             32
Spain                      28
Turkey                     26
Egypt                      22
Canada                     18
Poland                     15
Italy                      14
France                     13
Argentina                  12
Sweden                     11
Austria                    10
New Zealand                 9
Belgium                     9
Romania                     9
Finland                     9
Morocco                     8
Portugal                    8
Netherlands                 8
Greece                      8
Kazakhstan                  7
Bulgaria                    7
Serbia                      7
Bosnia and Herzegovina      7
Australia                   6
Czech Republic              5
Switzerland                 5
No

In [50]:
# Return boolean Series denoting duplicate rows. .sum() to check the count of duplicates for each column
df_excel.duplicated(subset=["Country", "City"], keep='first').sum()

# check more on here: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.duplicated.html

3

In [52]:
# Remove all duplicates:
df_excel.drop_duplicates(subset= ["Country", "Store Code"], keep="first", ignore_index=True, inplace = True)

# check more on here : https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop_duplicates.html

### 6) JOIN

[Join documentation](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.join.html)

In [None]:
# JOIN METHOD
df_excel.join(df_filtered1, on=None, how='left', lsuffix='', rsuffix='', sort=False)

# other:        Pass right DataFrame object or list of DataFrame objects.
# on:           Specify which index you wante to join on when you have multiple indexes.
# how:          Use to specify the join type. Accepts inner, left, right, outer.
# lsuffix:      Specify the left suffix string to column names
# rsuffix:      Specify the right suffix string to column names
# sort:         To specify the results to be sorted.

[Merge documentation](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html)

In [54]:
# MERGE METHOD

technologies = {
    'Courses':["Spark","PySpark","Python","pandas"],
    'Fee' :[20000,25000,22000,30000],
    'Duration':['30days','40days','35days','50days'],
              }
index_labels=['r1','r2','r3','r4']
df1 = pd.DataFrame(technologies,index=index_labels)

technologies2 = {
    'Courses':["Spark","Java","Python","Go"],
    'Discount':[2000,2300,1200,2000]
              }
index_labels2=['r1','r6','r3','r5']
df2 = pd.DataFrame(technologies2,index=index_labels2)

df3=pd.merge(df1,df2, left_on='Courses', right_on='Courses')


# right	 	    Required. A DataFrame, a Series to merge with
# how	        'left' 'right' 'outer''inner' 'cross'	Optional. Default 'inner'. Specifies how to merge
# on		    Optional. Specifies in what level to do the merging
# left_on		Optional. Specifies in what level to do the merging on the DataFrame to the left
# right_on      Optional. Specifies in what level to do the merging on the DataFrame to the right
# left_index	Optional. Default False. Whether to use the index from the left DataFrame as join key or not
# right_index	Optional. Default False. Whether to use the index from the right DataFrame as join key or not
# sort	        Optional. Default False. Specifies whether to sort the DataFrame by the join key or not
# suffixes		Optional. Default '_x', '_y''. Specifies a list of strings to add for overlapping columns
# copy		    Optional. Default True. Specifies whether to keep copies or not
# indicator		Optional. Default False. Specifies whether to add a column in the DataFrame with information about the source of each row
# validate		Optional. Checks if the mergin is of a specified type




> DIFFERENCE BETWEEN JOIN AND MERGE

- JOIN ONLY USES INDEX, MERGE ALLOWS YOU TO CHOOSE COLUMNS TO JOIN ON

- JOIN by default performs left join.

- MERGE additionally supports the cross join.


### 7) GROUP BY AND AGGREGATE FUNCTIONS

### 8) PIVOT

### 9) CONCATENATE

### 10) STRING FUNCTIONS

### 11) DATE FUNCTIONS

### 12) DATAFRAME WRITE FUNCTIONS