## 1.handling missing data

In [1]:
import pandas as pd 
import numpy as np 


In [2]:
# None and np.nan both end up as NaN once the Series is stored as float64
values_with_gaps = [23, 55, np.nan, None]
ps = pd.Series(values_with_gaps)
ps

0    23.0
1    55.0
2     NaN
3     NaN
dtype: float64

In [3]:
# Boolean mask: True wherever the value is missing
pd.isnull(ps)

0    False
1    False
2     True
3     True
dtype: bool

In [4]:
# isna produces the same missing-value mask as isnull
pd.isna(ps)

0    False
1    False
2     True
3     True
dtype: bool

## 2.filtering out missing data 


In [5]:
# Show the Series again: the last two entries are the missing ones to filter out
ps


0    23.0
1    55.0
2     NaN
3     NaN
dtype: float64

### First method

In [6]:
# dropna() returns a new Series without the NaN entries
ps.dropna()

0    23.0
1    55.0
dtype: float64

### Second method

In [7]:
# Boolean indexing: keep only the entries flagged as not missing
present = ps.notnull()
ps[present]

0    23.0
1    55.0
dtype: float64

In [8]:
# inplace=True removes the NaN entries from ps itself instead of returning
# a new Series, so ps stays shortened from this cell onwards

ps.dropna(inplace = True)
ps

0    23.0
1    55.0
dtype: float64

## Now working on DataFrames


In [9]:
# By default DataFrame.dropna() discards every row holding at least one NaN,
# so build a small frame with a missing cell in the first and last rows.
rows_with_gaps = [
    [1, 2, None, 4],
    [33, 33, 34, 12],
    [None, 23, 11, 22],
]
df = pd.DataFrame(rows_with_gaps)
df

Unnamed: 0,0,1,2,3
0,1.0,2,,4
1,33.0,33,34.0,12
2,,23,11.0,22


In [10]:
# we have two choices here: delete the rows or delete the columns that contain NaN

In [11]:
# Row-wise: discard each row that has a NaN in any column
df.dropna(axis=0, how='any')

Unnamed: 0,0,1,2,3
1,33.0,33,34.0,12


In [12]:
# Column-wise: discard each column that has a NaN in any row
df.dropna(axis='columns')

Unnamed: 0,1,3
0,2,4
1,33,12
2,23,22


In [13]:
# Rows can also be dropped only when every one of their values is NaN;
# the middle row of this frame is entirely missing.
sparse_rows = [
    [1, 2, None],
    [None, None, None],
    [3, 4, None],
]
df2 = pd.DataFrame(sparse_rows)
df2

Unnamed: 0,0,1,2
0,1.0,2.0,
1,,,
2,3.0,4.0,


In [14]:
# Remove only the rows whose values are all NaN
df2.dropna(axis='index', how='all')

Unnamed: 0,0,1,2
0,1.0,2.0,
2,3.0,4.0,


In [15]:
# Remove only the columns whose values are all NaN
df2.dropna(axis='columns', how='all')

Unnamed: 0,0,1
0,1.0,2.0
1,,
2,3.0,4.0


In [16]:
# we can also specify how many non-NaN values a row must have in order to be kept

df2

Unnamed: 0,0,1,2
0,1.0,2.0,
1,,,
2,3.0,4.0,


In [17]:
# Each row has one more missing value than the row above it
increasingly_sparse = [
    [1, 2, 3, None],
    [1, 2, None, None],
    [1, None, None, None],
]
df3 = pd.DataFrame(increasingly_sparse)
df3

Unnamed: 0,0,1,2,3
0,1,2.0,3.0,
1,1,2.0,,
2,1,,,


In [18]:
# thresh=2 keeps only the rows that have at least 2 non-NaN values
# (thresh counts valid observations, not missing ones), so here the
# row with a single non-NaN value is the only one dropped
df3.dropna(thresh = 2)

Unnamed: 0,0,1,2,3
0,1,2.0,3.0,
1,1,2.0,,


## 3.filling in missing data

In [19]:
# Show df again: it still contains its two NaN cells
df

Unnamed: 0,0,1,2,3
0,1.0,2,,4
1,33.0,33,34.0,12
2,,23,11.0,22


In [20]:
# Every missing cell is replaced by 0 in the returned frame
df.fillna(value=0)

Unnamed: 0,0,1,2,3
0,1.0,2,0.0,4
1,33.0,33,34.0,12
2,0.0,23,11.0,22


In [21]:
# Per-column fill values: NaN in column 0 becomes 0, NaN in column 2 becomes 1
fill_values = {0: 0, 2: 1}
df.fillna(value=fill_values)

Unnamed: 0,0,1,2,3
0,1.0,2,1.0,4
1,33.0,33,34.0,12
2,0.0,23,11.0,22


In [22]:
# Backward fill: each NaN takes the next valid value further down the same
# column; a NaN with no valid value below it (last row of column 0) stays NaN.
# DataFrame.bfill() is used because fillna(method='bfill') is deprecated.
df.bfill()

Unnamed: 0,0,1,2,3
0,1.0,2,34.0,4
1,33.0,33,34.0,12
2,,23,11.0,22


In [23]:
# Fill each column's gaps with that column's mean (a common, simple choice)
column_means = df.mean()
df.fillna(column_means)

Unnamed: 0,0,1,2,3
0,1.0,2,22.5,4
1,33.0,33,34.0,12
2,17.0,23,11.0,22


## 4.remove duplicate entries



In [24]:
# Rows 3 and 4 repeat rows 0 and 2 exactly (same name and same age)
df4 = pd.DataFrame(
    {
        "name": ['walid', 'hamza', 'bora', 'walid', 'bora'],
        "age": [21, 20, 21, 21, 21],
    }
)
df4

Unnamed: 0,name,age
0,walid,21
1,hamza,20
2,bora,21
3,walid,21
4,bora,21


In [25]:
# True as soon as at least one row repeats an earlier row
duplicate_flags = df4.duplicated()
duplicate_flags.any()

True

In [26]:
# Drop the repeated rows directly on df4, keeping the first occurrence of each
df4.drop_duplicates(keep='first', inplace=True)
df4

Unnamed: 0,name,age
0,walid,21
1,hamza,20
2,bora,21


## 5.replacing values

In [27]:
# NOTE: the name df4 is rebound here to a plain Series;
# 999 and -999 stand in for abnormal values
readings = [1, 2, 3, 999, -999, 5, 999]
df4 = pd.Series(readings)
df4

0      1
1      2
2      3
3    999
4   -999
5      5
6    999
dtype: int64

In [28]:
# Turn the abnormal values into NaN, directly on df4
abnormal_values = [999, -999]
df4.replace(abnormal_values, np.nan, inplace=True)
df4

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    5.0
6    NaN
dtype: float64

In [29]:
# Single-column frame of 'oui' / 'non' answers
answers = ['oui', 'non', 'oui', 'oui', 'oui', 'non']
df5 = pd.DataFrame({"vaccinated": answers})
df5

Unnamed: 0,vaccinated
0,oui
1,non
2,oui
3,oui
4,oui
5,non


In [30]:
# Map each answer to an explicit English label
answer_labels = {'oui': 'vaccinated', 'non': 'not vaccinated'}
df5.replace(answer_labels)

Unnamed: 0,vaccinated
0,vaccinated
1,not vaccinated
2,vaccinated
3,vaccinated
4,vaccinated
5,not vaccinated


## 6.renaming columns and index labels

In [31]:
# 4x3 grid holding 1..12, with named rows and named columns.
# NOTE(review): 'pyhton' looks like a misspelling of 'python'; it is kept
# unchanged because it is a runtime label.
row_labels = ['black', 'green', 'yellow', 'pink']
column_labels = ['java', 'pyhton', 'c++']
df6 = pd.DataFrame(
    np.arange(1, 13).reshape(4, 3),
    index=row_labels,
    columns=column_labels,
)
df6

Unnamed: 0,java,pyhton,c++
black,1,2,3
green,4,5,6
yellow,7,8,9
pink,10,11,12


In [32]:
# Relabel the 'pink' row as 'red', modifying df6 itself
index_renames = {'pink': 'red'}
df6.rename(index=index_renames, inplace=True)
df6

Unnamed: 0,java,pyhton,c++
black,1,2,3
green,4,5,6
yellow,7,8,9
red,10,11,12


In [33]:
# Relabel the 'c++' column as 'c', modifying df6 itself
language_renames = {'c++': 'c'}
df6.rename(columns=language_renames, inplace=True)
df6

Unnamed: 0,java,pyhton,c
black,1,2,3
green,4,5,6
yellow,7,8,9
red,10,11,12


In [34]:
# Title-case every row label through the index's .str accessor
df6.index = df6.index.str.title()
df6

Unnamed: 0,java,pyhton,c
Black,1,2,3
Green,4,5,6
Yellow,7,8,9
Red,10,11,12


In [35]:
# Upper-case every column name through the .str accessor of the columns Index
df6.columns = df6.columns.str.upper()
df6

Unnamed: 0,JAVA,PYHTON,C
Black,1,2,3
Green,4,5,6
Yellow,7,8,9
Red,10,11,12


## 7. Filtering outliers

In [41]:
# Load the population CSV and preview its first rows
population_csv = "population_by_country_2020.csv"
df7 = pd.read_csv(population_csv)
df7.head()


Unnamed: 0,Country (or dependency),Population (2020),Yearly Change,Net Change,Density (P/Km²),Land Area (Km²),Migrants (net),Fert. Rate,Med. Age,Urban Pop %,World Share
0,China,1438207241,0.39 %,5540090,153,9388211,-348399.0,1.7,38,61 %,18.47 %
1,India,1377233523,0.99 %,13586631,464,2973190,-532687.0,2.2,28,35 %,17.70 %
2,United States,330610570,0.59 %,1937734,36,9147420,954806.0,1.8,38,83 %,4.25 %
3,Indonesia,272931713,1.07 %,2898047,151,1811570,-98955.0,2.3,30,56 %,3.51 %
4,Pakistan,219992900,2.00 %,4327022,287,770880,-233379.0,3.6,23,35 %,2.83 %


In [50]:
# Give the 'Med. Age' column a simpler name, modifying df7 itself
age_column_rename = {'Med. Age': 'medAge'}
df7.rename(columns=age_column_rename, inplace=True)
df7.head()

Unnamed: 0,Country (or dependency),Population (2020),Yearly Change,Net Change,Density (P/Km²),Land Area (Km²),Migrants (net),Fert. Rate,medAge,Urban Pop %,World Share
0,China,1438207241,0.39 %,5540090,153,9388211,-348399.0,1.7,38,61 %,18.47 %
1,India,1377233523,0.99 %,13586631,464,2973190,-532687.0,2.2,28,35 %,17.70 %
2,United States,330610570,0.59 %,1937734,36,9147420,954806.0,1.8,38,83 %,4.25 %
3,Indonesia,272931713,1.07 %,2898047,151,1811570,-98955.0,2.3,30,56 %,3.51 %
4,Pakistan,219992900,2.00 %,4327022,287,770880,-233379.0,3.6,23,35 %,2.83 %


## 8. Shuffling and random sampling

In [58]:
# Ten random integers in [0, 12). No seed is set, so the values differ on every run.
ds8 = pd.Series(np.random.randint(low=0, high=12, size=10))
ds8

0     3
1     9
2     2
3     4
4    11
5     3
6    10
7     6
8     4
9     0
dtype: int32

In [63]:
# Shuffle the Series: frac=1 samples 100 % of the rows, in random order.
# The original index labels travel with the values, and no random_state is
# passed, so the order differs on every run.
ds8.sample(frac = 1 )

2     2
1     9
7     6
6    10
8     4
5     3
4    11
9     0
3     4
0     3
dtype: int32

In [64]:
# Shuffle, then discard the old labels so the index runs from 0 again
shuffled = ds8.sample(frac=1)
shuffled.reset_index(drop=True)

0     9
1    10
2    11
3     2
4     0
5     4
6     4
7     6
8     3
9     3
dtype: int32

In [65]:
# Draw a random half of the rows
ds8.sample(frac=0.5)

4    11
9     0
2     2
1     9
5     3
dtype: int32

In [68]:
# Load the population CSV into df8 for the sampling examples
population_csv = "population_by_country_2020.csv"
df8 = pd.read_csv(population_csv)
df8

Unnamed: 0,Country (or dependency),Population (2020),Yearly Change,Net Change,Density (P/Km²),Land Area (Km²),Migrants (net),Fert. Rate,Med. Age,Urban Pop %,World Share
0,China,1438207241,0.39 %,5540090,153,9388211,-348399.0,1.7,38,61 %,18.47 %
1,India,1377233523,0.99 %,13586631,464,2973190,-532687.0,2.2,28,35 %,17.70 %
2,United States,330610570,0.59 %,1937734,36,9147420,954806.0,1.8,38,83 %,4.25 %
3,Indonesia,272931713,1.07 %,2898047,151,1811570,-98955.0,2.3,30,56 %,3.51 %
4,Pakistan,219992900,2.00 %,4327022,287,770880,-233379.0,3.6,23,35 %,2.83 %
...,...,...,...,...,...,...,...,...,...,...,...
230,Montserrat,4991,0.06 %,3,50,100,,N.A.,N.A.,10 %,0.00 %
231,Falkland Islands,3458,3.05 %,103,0,12170,,N.A.,N.A.,66 %,0.00 %
232,Niue,1624,0.68 %,11,6,260,,N.A.,N.A.,46 %,0.00 %
233,Tokelau,1354,1.27 %,17,136,10,,N.A.,N.A.,0 %,0.00 %


In [70]:
# Random 10 % of the rows, renumbered from 0
sampled_countries = df8.sample(frac=0.1)
sampled_countries.reset_index(drop=True)

Unnamed: 0,Country (or dependency),Population (2020),Yearly Change,Net Change,Density (P/Km²),Land Area (Km²),Migrants (net),Fert. Rate,Med. Age,Urban Pop %,World Share
0,Turkey,84153250,1.09 %,909452,110,769630,283922.0,2.1,32,76 %,1.08 %
1,Germany,83730223,0.32 %,266897,240,348560,543822.0,1.6,46,76 %,1.07 %
2,San Marino,33917,0.21 %,71,566,60,,N.A.,N.A.,97 %,0.00 %
3,Trinidad and Tobago,1398579,0.32 %,4515,273,5130,-800.0,1.7,36,52 %,0.02 %
4,Guyana,785788,0.48 %,3786,4,196850,-6000.0,2.5,27,27 %,0.01 %
5,Côte d'Ivoire,26239250,2.57 %,661730,83,318000,-8000.0,4.7,19,51 %,0.34 %
6,Sint Maarten,42776,1.15 %,488,1261,34,,N.A.,N.A.,96 %,0.00 %
7,Latvia,1890218,-1.08 %,-20545,30,62200,-14837.0,1.7,44,69 %,0.02 %
8,Chile,19082804,0.87 %,164163,26,743532,111708.0,1.7,35,85 %,0.25 %
9,India,1377233523,0.99 %,13586631,464,2973190,-532687.0,2.2,28,35 %,17.70 %


In [86]:
# Boolean mask selecting the row(s) whose country is Morocco
is_morocco = df8['Country (or dependency)'] == 'Morocco'
df8[is_morocco]

Unnamed: 0,Country (or dependency),Population (2020),Yearly Change,Net Change,Density (P/Km²),Land Area (Km²),Migrants (net),Fert. Rate,Med. Age,Urban Pop %,World Share
39,Morocco,36820713,1.20 %,438791,83,446300,-51419.0,2.4,30,64 %,0.47 %


## 9.Dummy variables

In [88]:
# Dummy (indicator) variables are used for categorical data (e.g. male / female).
# 'validation' is the categorical column of this small example frame.
df9 = pd.DataFrame(
    {
        'name': ["walid", "hamza", "bora"],
        'validation': ["ratt", "ratt", "validee"],
    }
)
df9

Unnamed: 0,name,validation
0,walid,ratt
1,hamza,ratt
2,bora,validee


In [90]:
# One indicator column per distinct value of 'validation'.
# dtype=int pins the indicators to 1/0; recent pandas versions otherwise
# return boolean True/False columns.
validation_dummies = pd.get_dummies(df9['validation'], dtype=int)
validation_dummies

Unnamed: 0,ratt,validee
0,1,0
1,1,0
2,0,1


In [92]:
# Attach the dummy columns to the original DataFrame with the join method
# (join aligns on the index). NOTE: df9 is overwritten, so re-running this
# cell without re-creating df9 would try to join the same columns again,
# which pandas rejects because the column names overlap.
df9 =df9.join(validation_dummies)

In [93]:
# Show df9 with the dummy columns attached
df9

Unnamed: 0,name,validation,ratt,validee
0,walid,ratt,1,0
1,hamza,ratt,1,0
2,bora,validee,0,1


## 10. String object methods 

In [95]:
# Splitting on ',' keeps the blanks that surround each piece
text1 = "hello, world , my name is walid"
separator = ','
text1.split(separator)

['hello', ' world ', ' my name is walid']

In [98]:
# Trim the whitespace left around each comma-separated piece
words = list(map(str.strip, text1.split(',')))
words

['hello', 'world', 'my name is walid']