# Working with Pandas
    - Work with dataframes & Series
    - Selection, filtering, manipulation
    - data import / export with pandas
    - data wrangling


Pandas has two data types:
- pandas series: 1D data
- pandas dataframe: 2D data

In [2]:
import pandas as pd

In [3]:
# Pandas Series
s = pd.Series([12,56,23,45,78],name='Age')
s

0    12
1    56
2    23
3    45
4    78
Name: Age, dtype: int64

In [4]:
type(s)

pandas.core.series.Series

In [5]:
# Creating a data frame
data = {'Name':['Anshu','Carlo','Ahmad','Jenny','Jasmine','Jennifer'],
'Gender':['Male','Male','Male','Female','Female','Female'],
'Age':[45,25,22,35,42,32],
'Current City':['DXB','KUL','HXB','DEL','MAA','AUH']}

df = pd.DataFrame(data)
df

Unnamed: 0,Name,Gender,Age
0,Anshu,Male,45
1,Carlo,Male,25
2,Ahmad,Male,22
3,Jenny,Female,35
4,Jasmine,Female,42
5,Jennifer,Female,32


In [6]:
type(df)

pandas.core.frame.DataFrame

#### Selection with dataframes

In [11]:
df

Unnamed: 0,Name,Gender,Age
0,Anshu,Male,45
1,Carlo,Male,25
2,Ahmad,Male,22
3,Jenny,Female,35
4,Jasmine,Female,42
5,Jennifer,Female,32


In [12]:
df.head()

Unnamed: 0,Name,Gender,Age
0,Anshu,Male,45
1,Carlo,Male,25
2,Ahmad,Male,22
3,Jenny,Female,35
4,Jasmine,Female,42


In [13]:
df.head(2)

Unnamed: 0,Name,Gender,Age
0,Anshu,Male,45
1,Carlo,Male,25


In [14]:
df.tail()

Unnamed: 0,Name,Gender,Age
1,Carlo,Male,25
2,Ahmad,Male,22
3,Jenny,Female,35
4,Jasmine,Female,42
5,Jennifer,Female,32


In [15]:
df.tail(3)

Unnamed: 0,Name,Gender,Age
3,Jenny,Female,35
4,Jasmine,Female,42
5,Jennifer,Female,32


In [7]:
# accessing single column
df['Name']

0       Anshu
1       Carlo
2       Ahmad
3       Jenny
4     Jasmine
5    Jennifer
Name: Name, dtype: object

In [16]:
df.Name # only works if column does not have a whitespace character

0       Anshu
1       Carlo
2       Ahmad
3       Jenny
4     Jasmine
5    Jennifer
Name: Name, dtype: object

In [17]:
type(df['Name'])

pandas.core.series.Series

In [9]:
df[['Name','Age']] # use two square brackets to access multiple columns from a dataframe

Unnamed: 0,Name,Age
0,Anshu,45
1,Carlo,25
2,Ahmad,22
3,Jenny,35
4,Jasmine,42
5,Jennifer,32


In [10]:
type(df[['Name','Age']])

pandas.core.frame.DataFrame

In [19]:
# accessing data row wise
df[2:5]


Unnamed: 0,Name,Gender,Age
2,Ahmad,Male,22
3,Jenny,Female,35
4,Jasmine,Female,42


In [20]:
df['Age'][2:5]

2    22
3    35
4    42
Name: Age, dtype: int64

In [24]:
# fetch data based on position of cols and rows
df.iloc[2:6,1:3]

Unnamed: 0,Gender,Age
2,Male,22
3,Female,35
4,Female,42
5,Female,32


#### Filtering

In [25]:
df

Unnamed: 0,Name,Gender,Age
0,Anshu,Male,45
1,Carlo,Male,25
2,Ahmad,Male,22
3,Jenny,Female,35
4,Jasmine,Female,42
5,Jennifer,Female,32


In [26]:
# numeric filter
df.Age<35

0    False
1     True
2     True
3    False
4    False
5     True
Name: Age, dtype: bool

In [27]:
df[df.Age<35]

Unnamed: 0,Name,Gender,Age
1,Carlo,Male,25
2,Ahmad,Male,22
5,Jennifer,Female,32


In [28]:
# categoric filter
df[df.Gender=='Male']

Unnamed: 0,Name,Gender,Age
0,Anshu,Male,45
1,Carlo,Male,25
2,Ahmad,Male,22


In [29]:
# combine multiple conditions
# AND
df[df.Age<35][df.Gender=='Female']

  df[df.Age<35][df.Gender=='Female']


Unnamed: 0,Name,Gender,Age
5,Jennifer,Female,32


In [30]:
# AND logic
df[ (df.Age<35) & (df.Gender=='Male')]

Unnamed: 0,Name,Gender,Age
1,Carlo,Male,25
2,Ahmad,Male,22


In [31]:
# OR logic
df[ (df.Age<35) | (df.Gender=='Male')]

Unnamed: 0,Name,Gender,Age
0,Anshu,Male,45
1,Carlo,Male,25
2,Ahmad,Male,22
5,Jennifer,Female,32


In [32]:
#sort values
df.sort_values(by='Age')

Unnamed: 0,Name,Gender,Age
2,Ahmad,Male,22
1,Carlo,Male,25
5,Jennifer,Female,32
3,Jenny,Female,35
4,Jasmine,Female,42
0,Anshu,Male,45


In [33]:
#sort values
df.sort_values(by='Age').head(1)

Unnamed: 0,Name,Gender,Age
2,Ahmad,Male,22


In [34]:
df[df.Age==df.Age.min()]

Unnamed: 0,Name,Gender,Age
2,Ahmad,Male,22


#### Wrangling and manipulation


In [35]:
df

Unnamed: 0,Name,Gender,Age
0,Anshu,Male,45
1,Carlo,Male,25
2,Ahmad,Male,22
3,Jenny,Female,35
4,Jasmine,Female,42
5,Jennifer,Female,32


In [37]:
# Creating a data frame
data = {'Name':['James','John','Jessica'],
'Gender':['Male','Male','Female'],
'Age':[45,25,32]}

df2 = pd.DataFrame(data)
df2

Unnamed: 0,Name,Gender,Age
0,James,Male,45
1,John,Male,25
2,Jessica,Female,32


In [38]:
df = pd.concat([df,df2])
df

Unnamed: 0,Name,Gender,Age
0,Anshu,Male,45
1,Carlo,Male,25
2,Ahmad,Male,22
3,Jenny,Female,35
4,Jasmine,Female,42
5,Jennifer,Female,32
0,James,Male,45
1,John,Male,25
2,Jessica,Female,32


In [39]:
df['City'] = ['DXB','KUL','AUH','CGK','HGK','HCM','AUH','DXB','MAA']
df

Unnamed: 0,Name,Gender,Age,City
0,Anshu,Male,45,DXB
1,Carlo,Male,25,KUL
2,Ahmad,Male,22,AUH
3,Jenny,Female,35,CGK
4,Jasmine,Female,42,HGK
5,Jennifer,Female,32,HCM
0,James,Male,45,AUH
1,John,Male,25,DXB
2,Jessica,Female,32,MAA


In [40]:
df.reset_index(drop=True,inplace=True)
df

Unnamed: 0,Name,Gender,Age,City
0,Anshu,Male,45,DXB
1,Carlo,Male,25,KUL
2,Ahmad,Male,22,AUH
3,Jenny,Female,35,CGK
4,Jasmine,Female,42,HGK
5,Jennifer,Female,32,HCM
6,James,Male,45,AUH
7,John,Male,25,DXB
8,Jessica,Female,32,MAA


In [41]:
df['Age_cat'] = pd.cut(df['Age'],bins=[0,25,40,90],labels=['Young','Adult','Old'])
df

Unnamed: 0,Name,Gender,Age,City,Age_cat
0,Anshu,Male,45,DXB,Old
1,Carlo,Male,25,KUL,Young
2,Ahmad,Male,22,AUH,Young
3,Jenny,Female,35,CGK,Adult
4,Jasmine,Female,42,HGK,Old
5,Jennifer,Female,32,HCM,Adult
6,James,Male,45,AUH,Old
7,John,Male,25,DXB,Young
8,Jessica,Female,32,MAA,Adult


In [49]:
# drop a column
df.drop(columns=['Age_cat'],inplace=True)

In [50]:
df

Unnamed: 0,Name,Gender,Age,City
0,Anshu,Male,45,DXB
1,Carlo,Male,25,KUL
2,Ahmad,Male,22,AUH
3,Jenny,Female,35,CGK
4,Jasmine,Female,42,HGK
5,Jennifer,Female,32,HCM
6,James,Male,45,AUH
7,John,Male,25,DXB
8,Jessica,Female,32,MAA


In [51]:
# drop a column
df.drop(index=[5,6,7],inplace=True)
df

Unnamed: 0,Name,Gender,Age,City
0,Anshu,Male,45,DXB
1,Carlo,Male,25,KUL
2,Ahmad,Male,22,AUH
3,Jenny,Female,35,CGK
4,Jasmine,Female,42,HGK
8,Jessica,Female,32,MAA


#### Statistical Analysis

In [53]:
df.describe(include='all')

Unnamed: 0,Name,Gender,Age,City
count,6,6,6.0,6
unique,6,2,,6
top,Anshu,Male,,DXB
freq,1,3,,1
mean,,,33.5,
std,,,9.093954,
min,,,22.0,
25%,,,26.75,
50%,,,33.5,
75%,,,40.25,


In [54]:
df.Age.min()

22

In [55]:
df.Age.max()

45

In [56]:
df.Age.mean()

33.5

In [57]:
df.Age.median()

33.5

In [61]:
df.Gender.mode()

0    Female
1      Male
dtype: object

In [62]:
df.Age.var()

82.7

In [63]:
df.Age.std()

9.093954035511725

In [64]:
df.Age.skew()

0.0

### Data Import / Export with Pandas

In [67]:
# loading data from a CSV file
df = pd.read_csv(r"D:\AI\data\datasets-1\Bank_churn_modelling.csv")
df.shape

(10000, 14)

In [68]:
df.head(2)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2.0,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1.0,83807.86,1,0,1,112542.58,0


In [70]:
# loading data from an Excel file
df = pd.read_excel(r"D:\AI\data\datasets-1\Bank_churn_modelling.xlsx")
df.shape

(10000, 14)

In [71]:
df.head(2)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0


In [72]:
# loading data from a json file
df = pd.read_json(r"D:\AI\data\datasets-1\server-metrics.json")
df.shape

(20, 6)

In [73]:
df.head()

Unnamed: 0,id,dates,Accepted,Rejected,App,Server
0,0,1525046400000,218,182,App_4,Server_02
1,1,1525132800000,2592,182,App_3,Server_02
2,2,1525219200000,509,439,App_4,Server_0
3,3,1525305600000,2439,53,App_5,Server_01
4,4,1525392000000,824,444,App_5,Server_0


In [74]:
url = "https://en.wikipedia.org/wiki/COVID-19_pandemic_by_country_and_territory"
df_list = pd.read_html(url)
len(df_list)

70

In [75]:
df_list[0]

Unnamed: 0,COVID-19 pandemic,COVID-19 pandemic.1
0,"Confirmed deaths per 100,000 population,[1][no...","Confirmed deaths per 100,000 population,[1][no..."
1,Disease,COVID-19
2,Virus strain,SARS-CoV-2
3,Source,"Probably bats, possibly via pangolins[2][3]"
4,Location,Worldwide
5,First outbreak,China[4]
6,Index case,"Wuhan, Hubei, China.mw-parser-output .geo-defa..."
7,Date,"17 November 2019[4] – present(2 years, 10 mont..."
8,Confirmed cases,"612,314,658[5]"
9,Deaths,"6,527,372[5]"


In [87]:
# get only those tables which has more than 10 rows
final_list = []
for df in df_list:
    if df.shape[0]>10:
        final_list.append(df)

In [88]:
len(final_list)

18

In [93]:
final_list[4]

Unnamed: 0,Region[29],Total cases,Total deaths,Cases per million,Deaths per million,Current weekly cases,Current weekly deaths,Population millions,Vaccinated %[30]
0,European Union,163770077,1138276,366114,2545,648555,2543,447,75.0
1,North America,98420934,1090029,266863,2956,523615,2936,369,74.6
2,Other Europe,55795334,491883,238808,2105,130330,812,234,61.0
3,South America,63162506,1300158,146932,3025,118462,1168,430,81.2
4,Russia and Central Asia,23934534,426857,101077,1803,339111,668,237,54.4
5,Central America,10976906,378719,61095,2108,37351,292,180,67.3
6,Middle East,22266626,236911,85315,908,28133,253,261,51.7
7,Oceania and islands in East Asia,47860321,292366,82894,506,1031665,2590,577,71.7
8,Caribbean,2573339,25811,59437,596,4997,55,43,46.1
9,South Asia,50123528,617393,27001,333,45511,211,1856,68.5


In [94]:
df = final_list[4]

In [97]:
# export a dataframe to a file/source
df.to_excel(r"D:\ai\Python-Programming\heretech\coivd_world_data.xlsx")

### Accessing/extracting data from HTML

In [115]:
import pandas as pd

In [98]:
url = "https://en.wikipedia.org/wiki/FIFA"
df_collection = pd.read_html(url) # read_html returns a list of dataframes
len(df_collection)

31

In [106]:
df_collection[7]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,Competition,,Year,Champions,Title,Runners-up,,Next edition[55],,
1,National teams,National teams,National teams,National teams,National teams,National teams,National teams,National teams,National teams,
2,FIFA World Cup,,2018 (Final),France,2nd,Croatia,,2022 (Final),,
3,Men's Olympic Football Tournament (U-23),,2020 (Final),Brazil,2nd,Spain,,2024 (Final),,
4,FIFA U-20 World Cup,,2019 (Final),Ukraine,1st,South Korea,,2023 (Final),,
5,FIFA U-17 World Cup,,2019 (Final),Brazil,4th,Mexico,,2023 (Final),,
6,FIFA Futsal World Cup,,2021 (Final),Portugal,1st,Argentina,,2024 (Final),,
7,Men's Youth Olympic Futsal Tournament (U-20),,2018 (Final),Brazil,1st,Russia,,2026,,
8,FIFA Beach Soccer World Cup (see the BSWW),,2021 (Final),Russia,3rd,Japan,,2023 (Final),,
9,FIFA Arab Cup (senior teams of the UAFA (Arab ...,,2021 (Final),Algeria,1st,Tunisia,,TBD,,


In [107]:
mydf = df_collection[7]
mydf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,Competition,,Year,Champions,Title,Runners-up,,Next edition[55],,
1,National teams,National teams,National teams,National teams,National teams,National teams,National teams,National teams,National teams,
2,FIFA World Cup,,2018 (Final),France,2nd,Croatia,,2022 (Final),,
3,Men's Olympic Football Tournament (U-23),,2020 (Final),Brazil,2nd,Spain,,2024 (Final),,
4,FIFA U-20 World Cup,,2019 (Final),Ukraine,1st,South Korea,,2023 (Final),,
5,FIFA U-17 World Cup,,2019 (Final),Brazil,4th,Mexico,,2023 (Final),,
6,FIFA Futsal World Cup,,2021 (Final),Portugal,1st,Argentina,,2024 (Final),,
7,Men's Youth Olympic Futsal Tournament (U-20),,2018 (Final),Brazil,1st,Russia,,2026,,
8,FIFA Beach Soccer World Cup (see the BSWW),,2021 (Final),Russia,3rd,Japan,,2023 (Final),,
9,FIFA Arab Cup (senior teams of the UAFA (Arab ...,,2021 (Final),Algeria,1st,Tunisia,,TBD,,


In [108]:
mydf.to_excel("D:\ai\fifadata.xlsx")

In [118]:
# check the current working direcrory
import os
os.getcwd()

'd:\\AI\\Python-Programming\\HereTech'

In [116]:
m = "hello from python \new world"
print(m)

hello from python 
ew world


In [117]:
m = r"hello from python \new world"
print(m)

hello from python \new world


### Data Aggregation with Pandas

In [129]:
df1 = pd.read_csv(r"D:\AI\data\datasets-1\regiment.csv",index_col='index')
df1.shape

(12, 5)

In [123]:
df1

Unnamed: 0_level_0,regiment,company,name,preTestScore,postTestScore
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Nighthawks,1st,Miller,4,25
1,Nighthawks,1st,Jacobson,24,94
2,Nighthawks,2nd,Ali,31,57
3,Nighthawks,2nd,Milner,2,62
4,Dragoons,1st,Cooze,3,70
5,Dragoons,1st,Jacon,4,25
6,Dragoons,2nd,Ryaner,24,94
7,Dragoons,2nd,Sone,31,57
8,Scouts,1st,Sloan,2,62
9,Scouts,1st,Piger,3,70


In [124]:
df2 = pd.read_csv(r"D:\AI\data\datasets-1\regiment2.csv")
df2

Unnamed: 0,name,date_of_birth
0,Miller,12-05-1985
1,Jacobson,30-05-1987
2,Ali,04-06-1990
3,Milner,01-08-1989
4,Cooze,12-09-1985
5,Jacon,30-05-1986
6,Ryaner,04-02-1990
7,Sone,01-01-1989
8,Sloan,12-05-1995
9,Piger,30-05-1986


In [125]:
#create a dataframe by merging data from regiment.csv and regiment2.csv using name column
df = pd.merge(left=df1,right=df2,on='name',how='inner')
df

Unnamed: 0,regiment,company,name,preTestScore,postTestScore,date_of_birth
0,Nighthawks,1st,Miller,4,25,12-05-1985
1,Nighthawks,1st,Jacobson,24,94,30-05-1987
2,Nighthawks,2nd,Ali,31,57,04-06-1990
3,Nighthawks,2nd,Milner,2,62,01-08-1989
4,Dragoons,1st,Cooze,3,70,12-09-1985
5,Dragoons,1st,Jacon,4,25,30-05-1986
6,Dragoons,2nd,Ryaner,24,94,04-02-1990
7,Dragoons,2nd,Sone,31,57,01-01-1989
8,Scouts,1st,Sloan,2,62,12-05-1995
9,Scouts,1st,Piger,3,70,30-05-1986


In [130]:
df1.drop(index=[4,5,9,11],inplace=True)
df1

Unnamed: 0_level_0,regiment,company,name,preTestScore,postTestScore
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Nighthawks,1st,Miller,4,25
1,Nighthawks,1st,Jacobson,24,94
2,Nighthawks,2nd,Ali,31,57
3,Nighthawks,2nd,Milner,2,62
6,Dragoons,2nd,Ryaner,24,94
7,Dragoons,2nd,Sone,31,57
8,Scouts,1st,Sloan,2,62
10,Scouts,2nd,Riani,2,62


In [132]:
df2.drop(index=[1,7],inplace=True)
df2

Unnamed: 0,name,date_of_birth
0,Miller,12-05-1985
2,Ali,04-06-1990
3,Milner,01-08-1989
4,Cooze,12-09-1985
5,Jacon,30-05-1986
6,Ryaner,04-02-1990
8,Sloan,12-05-1995
9,Piger,30-05-1986
10,Riani,09-03-1990
11,Alii,01-06-1979


In [133]:
#inner join
mydf = pd.merge(left=df1,right=df2,on='name',how='inner')
mydf

Unnamed: 0,regiment,company,name,preTestScore,postTestScore,date_of_birth
0,Nighthawks,1st,Miller,4,25,12-05-1985
1,Nighthawks,2nd,Ali,31,57,04-06-1990
2,Nighthawks,2nd,Milner,2,62,01-08-1989
3,Dragoons,2nd,Ryaner,24,94,04-02-1990
4,Scouts,1st,Sloan,2,62,12-05-1995
5,Scouts,2nd,Riani,2,62,09-03-1990


In [134]:
#left join
mydf = pd.merge(left=df1,right=df2,on='name',how='left')
mydf

Unnamed: 0,regiment,company,name,preTestScore,postTestScore,date_of_birth
0,Nighthawks,1st,Miller,4,25,12-05-1985
1,Nighthawks,1st,Jacobson,24,94,
2,Nighthawks,2nd,Ali,31,57,04-06-1990
3,Nighthawks,2nd,Milner,2,62,01-08-1989
4,Dragoons,2nd,Ryaner,24,94,04-02-1990
5,Dragoons,2nd,Sone,31,57,
6,Scouts,1st,Sloan,2,62,12-05-1995
7,Scouts,2nd,Riani,2,62,09-03-1990


In [135]:
#right join
mydf = pd.merge(left=df1,right=df2,on='name',how='right')
mydf

Unnamed: 0,regiment,company,name,preTestScore,postTestScore,date_of_birth
0,Nighthawks,1st,Miller,4.0,25.0,12-05-1985
1,Nighthawks,2nd,Ali,31.0,57.0,04-06-1990
2,Nighthawks,2nd,Milner,2.0,62.0,01-08-1989
3,,,Cooze,,,12-09-1985
4,,,Jacon,,,30-05-1986
5,Dragoons,2nd,Ryaner,24.0,94.0,04-02-1990
6,Scouts,1st,Sloan,2.0,62.0,12-05-1995
7,,,Piger,,,30-05-1986
8,Scouts,2nd,Riani,2.0,62.0,09-03-1990
9,,,Alii,,,01-06-1979


In [136]:
#outer join
mydf = pd.merge(left=df1,right=df2,on='name',how='outer')
mydf

Unnamed: 0,regiment,company,name,preTestScore,postTestScore,date_of_birth
0,Nighthawks,1st,Miller,4.0,25.0,12-05-1985
1,Nighthawks,1st,Jacobson,24.0,94.0,
2,Nighthawks,2nd,Ali,31.0,57.0,04-06-1990
3,Nighthawks,2nd,Milner,2.0,62.0,01-08-1989
4,Dragoons,2nd,Ryaner,24.0,94.0,04-02-1990
5,Dragoons,2nd,Sone,31.0,57.0,
6,Scouts,1st,Sloan,2.0,62.0,12-05-1995
7,Scouts,2nd,Riani,2.0,62.0,09-03-1990
8,,,Cooze,,,12-09-1985
9,,,Jacon,,,30-05-1986


In [138]:
df1 = pd.read_csv(r"D:\AI\data\datasets-1\regiment.csv",index_col='index')
df2 = pd.read_csv(r"D:\AI\data\datasets-1\regiment2.csv")
df = pd.merge(left=df1,right=df2,on='name',how='inner')
df

Unnamed: 0,regiment,company,name,preTestScore,postTestScore,date_of_birth
0,Nighthawks,1st,Miller,4,25,12-05-1985
1,Nighthawks,1st,Jacobson,24,94,30-05-1987
2,Nighthawks,2nd,Ali,31,57,04-06-1990
3,Nighthawks,2nd,Milner,2,62,01-08-1989
4,Dragoons,1st,Cooze,3,70,12-09-1985
5,Dragoons,1st,Jacon,4,25,30-05-1986
6,Dragoons,2nd,Ryaner,24,94,04-02-1990
7,Dragoons,2nd,Sone,31,57,01-01-1989
8,Scouts,1st,Sloan,2,62,12-05-1995
9,Scouts,1st,Piger,3,70,30-05-1986


In [139]:
# which regiment is the best/most efficient regiment after the training?
df.postTestScore.mean()

62.333333333333336

In [140]:
df.groupby(by=['regiment'])['postTestScore'].mean()

regiment
Dragoons      61.5
Nighthawks    59.5
Scouts        66.0
Name: postTestScore, dtype: float64

In [141]:
# which company of which regiment is most efficient company?
df.groupby(by=['regiment','company'])['postTestScore'].mean()

regiment    company
Dragoons    1st        47.5
            2nd        75.5
Nighthawks  1st        59.5
            2nd        59.5
Scouts      1st        66.0
            2nd        66.0
Name: postTestScore, dtype: float64

In [143]:
# create a column representing improvement made by individual soldiers during the training
df['improvement'] = df['postTestScore'] - df['preTestScore']
df

Unnamed: 0,regiment,company,name,preTestScore,postTestScore,date_of_birth,improvement
0,Nighthawks,1st,Miller,4,25,12-05-1985,21
1,Nighthawks,1st,Jacobson,24,94,30-05-1987,70
2,Nighthawks,2nd,Ali,31,57,04-06-1990,26
3,Nighthawks,2nd,Milner,2,62,01-08-1989,60
4,Dragoons,1st,Cooze,3,70,12-09-1985,67
5,Dragoons,1st,Jacon,4,25,30-05-1986,21
6,Dragoons,2nd,Ryaner,24,94,04-02-1990,70
7,Dragoons,2nd,Sone,31,57,01-01-1989,26
8,Scouts,1st,Sloan,2,62,12-05-1995,60
9,Scouts,1st,Piger,3,70,30-05-1986,67


In [144]:
# which regiment soldiers made most improvement and can be called as fastest learners ?
df.groupby(by=['regiment'])['improvement'].mean()


regiment
Dragoons      46.00
Nighthawks    44.25
Scouts        63.50
Name: improvement, dtype: float64

In [145]:
df

Unnamed: 0,regiment,company,name,preTestScore,postTestScore,date_of_birth,improvement
0,Nighthawks,1st,Miller,4,25,12-05-1985,21
1,Nighthawks,1st,Jacobson,24,94,30-05-1987,70
2,Nighthawks,2nd,Ali,31,57,04-06-1990,26
3,Nighthawks,2nd,Milner,2,62,01-08-1989,60
4,Dragoons,1st,Cooze,3,70,12-09-1985,67
5,Dragoons,1st,Jacon,4,25,30-05-1986,21
6,Dragoons,2nd,Ryaner,24,94,04-02-1990,70
7,Dragoons,2nd,Sone,31,57,01-01-1989,26
8,Scouts,1st,Sloan,2,62,12-05-1995,60
9,Scouts,1st,Piger,3,70,30-05-1986,67


In [146]:
x = "Anshu"
x.upper()

'ANSHU'

In [147]:
def change_name(x):
    return x.upper()

change_name("Jacob")

'JACOB'

In [148]:
df['name'] = df['name'].apply(change_name)
df

Unnamed: 0,regiment,company,name,preTestScore,postTestScore,date_of_birth,improvement
0,Nighthawks,1st,MILLER,4,25,12-05-1985,21
1,Nighthawks,1st,JACOBSON,24,94,30-05-1987,70
2,Nighthawks,2nd,ALI,31,57,04-06-1990,26
3,Nighthawks,2nd,MILNER,2,62,01-08-1989,60
4,Dragoons,1st,COOZE,3,70,12-09-1985,67
5,Dragoons,1st,JACON,4,25,30-05-1986,21
6,Dragoons,2nd,RYANER,24,94,04-02-1990,70
7,Dragoons,2nd,SONE,31,57,01-01-1989,26
8,Scouts,1st,SLOAN,2,62,12-05-1995,60
9,Scouts,1st,PIGER,3,70,30-05-1986,67


In [149]:
df

Unnamed: 0,regiment,company,name,preTestScore,postTestScore,date_of_birth,improvement
0,Nighthawks,1st,MILLER,4,25,12-05-1985,21
1,Nighthawks,1st,JACOBSON,24,94,30-05-1987,70
2,Nighthawks,2nd,ALI,31,57,04-06-1990,26
3,Nighthawks,2nd,MILNER,2,62,01-08-1989,60
4,Dragoons,1st,COOZE,3,70,12-09-1985,67
5,Dragoons,1st,JACON,4,25,30-05-1986,21
6,Dragoons,2nd,RYANER,24,94,04-02-1990,70
7,Dragoons,2nd,SONE,31,57,01-01-1989,26
8,Scouts,1st,SLOAN,2,62,12-05-1995,60
9,Scouts,1st,PIGER,3,70,30-05-1986,67


In [152]:
df.dtypes

regiment         object
company          object
name             object
preTestScore      int64
postTestScore     int64
date_of_birth    object
improvement       int64
dtype: object

In [153]:
# standardize date of birth col to pandas datetime format
df['date_of_birth'] = pd.to_datetime(df['date_of_birth'])
df.dtypes

regiment                 object
company                  object
name                     object
preTestScore              int64
postTestScore             int64
date_of_birth    datetime64[ns]
improvement               int64
dtype: object

In [154]:
import datetime
today = datetime.datetime.today()
today

datetime.datetime(2022, 9, 22, 9, 20, 10, 411211)

In [155]:
def dob2age(dob):
    days = (today-dob).days
    years = days/365
    return round(years)

In [156]:
dob2age(pd.to_datetime("12-12-1988"))

34

In [164]:
(today - pd.to_datetime("12-12-1988")).seconds/3600

9.33611111111111

In [157]:
df['age'] = df['date_of_birth'].apply(dob2age)
df

Unnamed: 0,regiment,company,name,preTestScore,postTestScore,date_of_birth,improvement,age
0,Nighthawks,1st,MILLER,4,25,1985-12-05,21,37
1,Nighthawks,1st,JACOBSON,24,94,1987-05-30,70,35
2,Nighthawks,2nd,ALI,31,57,1990-04-06,26,32
3,Nighthawks,2nd,MILNER,2,62,1989-01-08,60,34
4,Dragoons,1st,COOZE,3,70,1985-12-09,67,37
5,Dragoons,1st,JACON,4,25,1986-05-30,21,36
6,Dragoons,2nd,RYANER,24,94,1990-04-02,70,32
7,Dragoons,2nd,SONE,31,57,1989-01-01,26,34
8,Scouts,1st,SLOAN,2,62,1995-12-05,60,27
9,Scouts,1st,PIGER,3,70,1986-05-30,67,36


In [165]:
# which regiment has youngest soldiers
df.groupby(by=['regiment'])['age'].mean()


regiment
Dragoons      34.75
Nighthawks    34.50
Scouts        34.75
Name: age, dtype: float64

In [166]:
# which company of which regiment has youngest soldiers
df.groupby(by=['regiment','company'])['age'].mean()


regiment    company
Dragoons    1st        36.5
            2nd        33.0
Nighthawks  1st        36.0
            2nd        33.0
Scouts      1st        31.5
            2nd        38.0
Name: age, dtype: float64

In [167]:
# which company has best combination low age and high postTestScore
df.groupby(by=['regiment','company'])[['postTestScore','age']].mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,postTestScore,age
regiment,company,Unnamed: 2_level_1,Unnamed: 3_level_1
Dragoons,1st,47.5,36.5
Dragoons,2nd,75.5,33.0
Nighthawks,1st,59.5,36.0
Nighthawks,2nd,59.5,33.0
Scouts,1st,66.0,31.5
Scouts,2nd,66.0,38.0


In [169]:
df.groupby(by=['regiment','company'])[['postTestScore','age']].mean().sort_values(by=['age','postTestScore']).head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,postTestScore,age
regiment,company,Unnamed: 2_level_1,Unnamed: 3_level_1
Scouts,1st,66.0,31.5
