In [1]:
import pandas as pd

In [2]:
import numpy as np

In [26]:
# Load the weather readings from the CSV file into a DataFrame.
df = pd.read_csv('weather_data.csv')
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,day,temperature,windspeed,event
0,01-01-2017,32 F,6 mph,Rain
1,01-02-2017,-99999,7 mph,Sunny
2,01-03-2017,28,-99999,Snow
3,01-04-2017,-99999,7,no event
4,01-05-2017,32 C,-99999,Rain
5,01-06-2017,31,2,Sunny
6,01-06-2017,34,5,no event


## We want to replace the -99999 placeholder values with NaN

In [4]:
# Swap every -99999 in the frame for NaN. `replace` returns a new frame,
# so `df` itself is left as it was.
new_df = df.replace(to_replace=-99999, value=np.nan)
new_df

Unnamed: 0,day,temperature,windspeed,event
0,01-01-2017,32.0,6.0,Rain
1,01-02-2017,,7.0,Sunny
2,01-03-2017,28.0,,Snow
3,01-04-2017,,7.0,0
4,01-05-2017,32.0,,Rain
5,01-06-2017,31.0,2.0,Sunny
6,01-06-2017,34.0,5.0,0


## Suppose we have to replace two different values with NaN (-99999 and -88888).
## Then we pass them as a list:    new_df = df.replace([-99999, -88888], np.nan)

## Replacing values with NaN in specific columns

In [8]:
# Display `df` again; the earlier `replace` call returned a new frame, so `df` is unchanged.
df

Unnamed: 0,day,temperature,windspeed,event
0,01-01-2017,32,6,Rain
1,01-02-2017,-99999,7,Sunny
2,01-03-2017,28,-99999,Snow
3,01-04-2017,-99999,7,0
4,01-05-2017,32,-99999,Rain
5,01-06-2017,31,2,Sunny
6,01-06-2017,34,5,0


In [11]:
# Column-wise replacement: each key of the dict names a column and its value
# is the entry to look for in that column only. Every match is replaced by
# the single `value` given below (NaN).
new_df = df.replace(
    to_replace={
        'temperature': -99999,
        'windspeed': -99999,
        'event': '0',
    },
    value=np.nan,
)
new_df

Unnamed: 0,day,temperature,windspeed,event
0,01-01-2017,32.0,6.0,Rain
1,01-02-2017,,7.0,Sunny
2,01-03-2017,28.0,,Snow
3,01-04-2017,,7.0,
4,01-05-2017,32.0,,Rain
5,01-06-2017,31.0,2.0,Sunny
6,01-06-2017,34.0,5.0,


## Let's use a mapping when we simply want to replace some values with other values

In [20]:
# Display the source frame again for comparison with the mapped result below.
df

Unnamed: 0,day,temperature,windspeed,event
0,01-01-2017,32,6,Rain
1,01-02-2017,-99999,7,Sunny
2,01-03-2017,28,-99999,Snow
3,01-04-2017,-99999,7,no event
4,01-05-2017,32,-99999,Rain
5,01-06-2017,31,2,Sunny
6,01-06-2017,34,5,no event


## We want 'no event' to be replaced with 'Sunny' and all the -99999 values to be replaced with
## NaN, wherever they occur in the DataFrame (the mapping is not limited to one column)

In [23]:
# A dict passed without a separate `value` argument acts as an
# old-value -> new-value mapping that is applied across the whole frame.
new_df = df.replace(to_replace={'no event': 'Sunny', -99999: np.nan})
new_df

Unnamed: 0,day,temperature,windspeed,event
0,01-01-2017,32.0,6.0,Rain
1,01-02-2017,,7.0,Sunny
2,01-03-2017,28.0,,Snow
3,01-04-2017,,7.0,Sunny
4,01-05-2017,32.0,,Rain
5,01-06-2017,31.0,2.0,Sunny
6,01-06-2017,34.0,5.0,Sunny


## Sometimes values carry a unit of measure (e.g. "32 F", "6 mph"). We need to strip that unit so the columns can be analysed as numbers.

In [27]:
# Display the source frame again; note the unit suffixes in temperature and windspeed.
df

Unnamed: 0,day,temperature,windspeed,event
0,01-01-2017,32 F,6 mph,Rain
1,01-02-2017,-99999,7 mph,Sunny
2,01-03-2017,28,-99999,Snow
3,01-04-2017,-99999,7,no event
4,01-05-2017,32 C,-99999,Rain
5,01-06-2017,31,2,Sunny
6,01-06-2017,34,5,no event


### Whenever we want to replace a pattern (rather than an exact value) in a dataset, we use a regular expression (regex)

In [29]:
# With regex=True the first argument is treated as a pattern: every ASCII
# letter found in any string cell of the frame is replaced by the empty string.
new_df = df.replace(to_replace='[A-Za-z]', value='', regex=True)
new_df

Unnamed: 0,day,temperature,windspeed,event
0,01-01-2017,32,6,
1,01-02-2017,-99999,7,
2,01-03-2017,28,-99999,
3,01-04-2017,-99999,7,
4,01-05-2017,32,-99999,
5,01-06-2017,31,2,
6,01-06-2017,34,5,


## We can see that it removed the letters (mph, F, C) attached to the values, but it also wiped out the event column, because every value in that column is a string made of letters. To avoid that, let's be specific about which columns the regex applies to.

In [34]:
# Strip unit suffixes only from the columns named in the dict, so text
# columns such as `event` are left untouched. Each key is a column name and
# its value is the regex applied to that column.
# The pattern also consumes the whitespace around the unit, so '32 F' becomes
# '32' instead of '32 ' with a dangling space (which a bare '[A-Za-z]' leaves).
new_df = df.replace({
        'temperature': r'\s*[A-Za-z]+\s*',
        'windspeed': r'\s*[A-Za-z]+\s*'
},'',regex=True)
new_df

Unnamed: 0,day,temperature,windspeed,event
0,01-01-2017,32,6,Rain
1,01-02-2017,-99999,7,Sunny
2,01-03-2017,28,-99999,Snow
3,01-04-2017,-99999,7,no event
4,01-05-2017,32,-99999,Rain
5,01-06-2017,31,2,Sunny
6,01-06-2017,34,5,no event


## Let's see how we can replace a list of values with another list of values.
## First, let's create a new DataFrame.


In [35]:
# Small example frame: one qualitative score label per student.
df1 = pd.DataFrame(
    {
        'score': ['exceptional', 'average', 'good', 'poor', 'average', 'exceptional'],
        'student': ['rob', 'maya', 'parthiv', 'tom', 'julian', 'erica'],
    }
)
df1

Unnamed: 0,score,student
0,exceptional,rob
1,average,maya
2,good,parthiv
3,poor,tom
4,average,julian
5,exceptional,erica


## Now we want to convert the score labels into numbers: 'poor' becomes 1, 'average' 2, 'good' 3 and 'exceptional' 4.

In [37]:
# List-to-list replacement: the i-th entry of `to_replace` is swapped for the
# i-th entry of `value`, so the two lists must be given in matching order.
new_df1 = df1.replace(
    to_replace=['poor', 'average', 'good', 'exceptional'],
    value=[1, 2, 3, 4],
)
new_df1

Unnamed: 0,score,student
0,4,rob
1,2,maya
2,3,parthiv
3,1,tom
4,2,julian
5,4,erica
