# If more than 75% of a column's values are missing (NA), we can drop that column from the dataset

In [1]:
import pandas as pd
# Load the weather observations; parse_dates makes pandas parse 'day' as datetimes.
df = pd.read_csv(
    "weather_data.csv",
    parse_dates=['day'],
)
df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,,9.0,Sunny
2,2017-01-05,28.0,,Snow
3,2017-01-06,,7.0,
4,2017-01-07,32.0,,Rain
5,2017-01-08,,,Sunny
6,2017-01-09,,,
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


### parse_dates=['day'] tells read_csv to parse the 'day' column as dates (datetime values)

In [2]:
# Look up the entry labelled 0 in the 'day' column and report its Python type.
type(df['day'][0])

pandas._libs.tslibs.timestamps.Timestamp

In [3]:
# Show the 'temperature' value stored under index label 0.
df['temperature'][0]

32.0

In [4]:
# Show the 'windspeed' value stored under index label 1.
df['windspeed'][1]

9.0

In [5]:
# Show the 'event' value stored under index label 2.
df['event'][2]

'Snow'

### To use the 'day' column as the index, call set_index as shown below

In [6]:
# Promote 'day' to the row index; inplace=True mutates df itself instead of returning a copy.
df.set_index(keys='day', inplace=True)
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-09,,,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


### To fill the NA values with 0 without changing the original dataset, create a new DataFrame and do the operation there

In [8]:
# Build a separate frame with every missing entry replaced by 0; df itself is left untouched.
new_df = df.fillna(value=0)
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,0.0,9.0,Sunny
2017-01-05,28.0,0.0,Snow
2017-01-06,0.0,7.0,0
2017-01-07,32.0,0.0,Rain
2017-01-08,0.0,0.0,Sunny
2017-01-09,0.0,0.0,0
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


### What did it do? It filled every NA with 0, including the NAs in the event column, which is odd for a text column. To control this, we can pass a dictionary that gives each column its own fill value.

In [12]:
# Per-column replacements: numeric gaps become 0, a missing event becomes 'no event'.
new_df = df.fillna(value=dict(temperature=0, windspeed=0, event='no event'))
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,0.0,9.0,Sunny
2017-01-05,28.0,0.0,Snow
2017-01-06,0.0,7.0,no event
2017-01-07,32.0,0.0,Rain
2017-01-08,0.0,0.0,Sunny
2017-01-09,0.0,0.0,no event
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


### But the dataset still does not look good: the filled-in 0 values carry no real meaning. Let's deal with that...
### Fill each gap with the value from the previous row (forward fill) or the next row (back fill)

In [14]:
# Forward fill: each gap takes the last valid value above it in the same column.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` keyword is
# deprecated in recent pandas; use DataFrame.bfill() to fill from the row below.
new_df = df.ffill()
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,32.0,9.0,Sunny
2017-01-05,28.0,9.0,Snow
2017-01-06,28.0,7.0,Snow
2017-01-07,32.0,7.0,Rain
2017-01-08,32.0,7.0,Sunny
2017-01-09,32.0,7.0,Sunny
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


### We can also fill along the columns instead of down the rows by setting axis=1 (or axis='columns')

In [16]:
# Back fill along each row: a gap takes the value from the next column to its right.
# DataFrame.bfill(axis=...) replaces the deprecated fillna(method='bfill', axis=...).
new_df = df.bfill(axis='columns')
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32,6,Rain
2017-01-04,9,9,Sunny
2017-01-05,28,Snow,Snow
2017-01-06,7,7,
2017-01-07,32,Rain,Rain
2017-01-08,Sunny,Sunny,Sunny
2017-01-09,,,
2017-01-10,34,8,Cloudy
2017-01-11,40,12,Sunny


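### So each empty cell was back-filled with the value from the next column to its right (windspeed takes event's value, temperature takes windspeed's); cells with nothing valid to their right stay empty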

In [17]:
# Show new_df again without recomputing it.
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32,6,Rain
2017-01-04,9,9,Sunny
2017-01-05,28,Snow,Snow
2017-01-06,7,7,
2017-01-07,32,Rain,Rain
2017-01-08,Sunny,Sunny,Sunny
2017-01-09,,,
2017-01-10,34,8,Cloudy
2017-01-11,40,12,Sunny


### We can also limit how far a fill propagates. With limit=1, a previous value is copied into only the first empty cell of a gap and no further.

In [19]:
# limit=1 forward-fills at most one consecutive missing value per gap;
# limit=2 would fill up to two. Longer runs keep their remaining NaNs.
# DataFrame.ffill(limit=...) replaces the deprecated fillna(method='ffill', limit=...).
new_df = df.ffill(limit=1)
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,32.0,9.0,Sunny
2017-01-05,28.0,9.0,Snow
2017-01-06,28.0,7.0,Snow
2017-01-07,32.0,7.0,Rain
2017-01-08,32.0,,Sunny
2017-01-09,,,Sunny
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


## For better estimates of the missing values, there is a method called interpolate

In [20]:
# Estimate the gaps by linear interpolation, treating rows as equally spaced.
new_df = df.interpolate(method='linear')
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,30.0,9.0,Sunny
2017-01-05,28.0,8.0,Snow
2017-01-06,30.0,7.0,
2017-01-07,32.0,7.25,Rain
2017-01-08,32.666667,7.5,Sunny
2017-01-09,33.333333,7.75,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


### But there is a problem: the dates are not evenly spaced. After 2017-01-01 the next date is 2017-01-04, so plain linear interpolation, which treats the rows as equally spaced, gives a poor estimate there. To correct this, let's pass method='time'

In [21]:
# Interpolate again, this time weighting by the actual time gaps in the datetime index.
new_df = df.interpolate(method="time")
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,29.0,9.0,Sunny
2017-01-05,28.0,8.0,Snow
2017-01-06,30.0,7.0,
2017-01-07,32.0,7.25,Rain
2017-01-08,32.666667,7.5,Sunny
2017-01-09,33.333333,7.75,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


### So it recognised that the value should not be 30 but closer to 28, because the missing date 01-04 is only one day before 01-05 (28.0) and three days after 01-01 (32.0),
### so the interpolated value should be closer to 28; that is why it fitted 29 there

### so if we want to drop some rows with 'na' values,-------->

In [23]:
# Keep only the rows that are complete, i.e. drop any row containing a missing value.
new_df = df.dropna(axis=0, how='any')
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


### so if we want to drop the rows which have only 'na' values present in it

In [24]:
# Discard a row only when every one of its columns is missing.
new_df = df.dropna(axis='index', how='all')
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


### so here we can see there is no 2017-01-09 is present,coz in that row, only na values were present

### Now suppose we want to keep only the rows that have at least one non-NA value; rows that do not have even one non-NA value will be dropped

In [25]:
# thresh sets the minimum number of non-NA values a row needs in order to be kept.
new_df = df.dropna(thresh=1)    #thresh=1 keeps only the rows that have at least one non-NA value
new_df                         #similarly thresh=2 keeps only the rows that have at least two non-NA values

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


### It kept the 2017-01-06 row because it has one non-NA value (7.0); the 2017-01-09 row is gone because it does not have a single non-NA value

## At last we will add the missing dates in the day column, lets do this

In [27]:
# Build the full daily calendar for 1-11 Jan 2017 and re-align df to it;
# dates that were absent from the data show up as all-NaN rows.
dt = pd.date_range(start='01-01-2017', end='01-11-2017')
idx = pd.DatetimeIndex(dt)  # wrap the generated range as a DatetimeIndex
df = df.reindex(index=idx)
df

Unnamed: 0,temperature,windspeed,event
2017-01-01,32.0,6.0,Rain
2017-01-02,,,
2017-01-03,,,
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-09,,,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


# THE END OF FIRST PART---->>>>>>TO BE CONTINUED