# Concepts of Cleaning 

# Merge

In [13]:
import pandas as pd

In [2]:
left_frame = pd.DataFrame({'key': range(5), 
                           'left_value': ['a', 'b', 'c', 'd', 'e']})
right_frame = pd.DataFrame({'key': range(2, 7), 
                           'right_value': ['f', 'g', 'h', 'i', 'j']})
print(left_frame)
print('\n')
print(right_frame)

   key left_value
0    0          a
1    1          b
2    2          c
3    3          d
4    4          e


   key right_value
0    2           f
1    3           g
2    4           h
3    5           i
4    6           j


# Inner Join

In [3]:
pd.merge(left_frame, right_frame, on='key', how='inner')

Unnamed: 0,key,left_value,right_value
0,2,c,f
1,3,d,g
2,4,e,h


# Left Join

In [4]:
pd.merge(left_frame, right_frame, on='key', how='left')

Unnamed: 0,key,left_value,right_value
0,0,a,
1,1,b,
2,2,c,f
3,3,d,g
4,4,e,h


# Right Join

In [5]:
pd.merge(left_frame, right_frame, on='key', how='right')

Unnamed: 0,key,left_value,right_value
0,2,c,f
1,3,d,g
2,4,e,h
3,5,,i
4,6,,j


# Outer Join

In [6]:
pd.merge(left_frame, right_frame, on='key', how='outer')

Unnamed: 0,key,left_value,right_value
0,0,a,
1,1,b,
2,2,c,f
3,3,d,g
4,4,e,h
5,5,,i
6,6,,j


# Concatenate

In [7]:
pd.concat([left_frame, right_frame])

Unnamed: 0,key,left_value,right_value
0,0,a,
1,1,b,
2,2,c,
3,3,d,
4,4,e,
0,2,,f
1,3,,g
2,4,,h
3,5,,i
4,6,,j


In [8]:
pd.concat([left_frame, right_frame], axis=1)

Unnamed: 0,key,left_value,key.1,right_value
0,0,a,2,f
1,1,b,3,g
2,2,c,4,h
3,3,d,5,i
4,4,e,6,j


# Duplicating 

In [9]:
df = pd.read_csv("Humidity1.csv")
df

FileNotFoundError: File b'Humidity1.csv' does not exist

# Lets check which rows are duplicated

In [None]:
df.duplicated()

# Lets check for the duplication in certain columns

In [None]:
df.duplicated(['date'])

# WAP - In class exe : Check for the duplication in the column "city"?

# Lets check for the duplication in two columns together?

In [None]:
df.duplicated(['date','city'])

In [None]:
df.duplicated(['date','city'], keep='last')

# WAP - In class exe : Try using keep ='first' and see what happens?

# Lets drop duplicates by considering city

In [None]:
df.drop_duplicates(['city'])

# In class lab exercise:
Drop the duplicates considering both date and city?

# Mapping 

In [10]:
Season = {'new york':'winter', 'mumbai':'summer'}

In [11]:
df['Season']=df['city'].map(Season)
df

NameError: name 'df' is not defined

# Replacing

In [None]:
df['city'].replace('new york','Italy')

# In class lab exercise:
Replace the date value 05-02-2017 with 10-11-2018?

# Renaming 

# Lets rename the column 'temperature' with 'temp'

In [None]:
df.rename(columns= {'temperature':'temp'})

# In class lab exercise:
 Rename the column 'city' with city_name?

# Summary Statistics

In [16]:
import pandas as pd 
import numpy as np
import csv
data = pd.read_csv("wine.csv", encoding="latin-1")
data.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",VulkÃ Bianco,87,,Sicily & Sardinia,Etna,,Kerin OâKeefe,@kerinokeefe,Nicosia 2013 VulkÃ Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwineÂ,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwineÂ,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


# Lets have a brief look at the first four rows of the data in table

In [None]:
data.rename(columns = {'Unnamed: 0':'serials'})

# Head function in python with arguments

In [None]:
data.head(8)

# In class lab exercise:
Get the first 15 rows from the data ?

In [None]:
data.tail()

# Tail function in python with arguments

In [None]:
data.tail(8)

# In class lab exercise: 
Get the last 15 rows from the data?

# describe() Function

# Lets get the Summary Statistic of the numeric columns:

In [17]:
data.describe()

Unnamed: 0.1,Unnamed: 0,points,price
count,129971.0,129971.0,120975.0
mean,64985.0,88.447138,35.363389
std,37519.540256,3.03973,41.022218
min,0.0,80.0,4.0
25%,32492.5,86.0,17.0
50%,64985.0,88.0,25.0
75%,97477.5,91.0,42.0
max,129970.0,100.0,3300.0


# Lets get the Summary Statistic of all the columns

In [18]:
data.describe(include='all')

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
count,129971.0,129908,129971,92506,129971.0,120975.0,129908,108724,50511,103727,98758,129971,129970,129971
unique,,43,119955,37979,,,425,1229,17,19,15,118840,707,16757
top,,US,"Gravenstein apple, honeysuckle and jasmine aro...",Reserve,,,California,Napa Valley,Central Coast,Roger Voss,@vossroger,Gloria Ferrer NV Sonoma Brut Sparkling (Sonoma...,Pinot Noir,Wines & Winemakers
freq,,54504,3,2009,,,36247,4480,11065,25514,25514,11,13272,222
mean,64985.0,,,,88.447138,35.363389,,,,,,,,
std,37519.540256,,,,3.03973,41.022218,,,,,,,,
min,0.0,,,,80.0,4.0,,,,,,,,
25%,32492.5,,,,86.0,17.0,,,,,,,,
50%,64985.0,,,,88.0,25.0,,,,,,,,
75%,97477.5,,,,91.0,42.0,,,,,,,,


# Lets get the basic information about our data

In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129971 entries, 0 to 129970
Data columns (total 14 columns):
Unnamed: 0               129971 non-null int64
country                  129908 non-null object
description              129971 non-null object
designation              92506 non-null object
points                   129971 non-null int64
price                    120975 non-null float64
province                 129908 non-null object
region_1                 108724 non-null object
region_2                 50511 non-null object
taster_name              103727 non-null object
taster_twitter_handle    98758 non-null object
title                    129971 non-null object
variety                  129970 non-null object
winery                   129971 non-null object
dtypes: float64(1), int64(2), object(11)
memory usage: 8.4+ MB


# Take home assignment  : 
Perform the summary statistics, and concepts of cleaning in the given data set.
    Data set Given - Class grades
