## Series
A one-dimensional labeled array capable of holding any data type 

In [1]:
#import pandas
import pandas as pd
#import numpy
import numpy as np

In [2]:
s = pd.Series(['india','pakistan','china'], index=['a','b','c'])
s

a       india
b    pakistan
c       china
dtype: object

## DataFrame

A two-dimensional labeled data structure with columns of potentially different types

### Creating a Dataframe with dictionary data

In [3]:
#Creating a dictionary 'data' with strings as keys and lists as value
data_from_dic = {
        'Country': ['Belgium',  'India',  'Brazil'],
        'Capital': ['Brussels',  'New Delhi',  'Brasilia'],
        'Population': [11190846, 1303171035, 207847528]
        }
df_from_dic = pd.DataFrame(data_from_dic)
df_from_dic


Unnamed: 0,Capital,Country,Population
0,Brussels,Belgium,11190846
1,New Delhi,India,1303171035
2,Brasilia,Brazil,207847528


In [4]:
## Dataframe with duplicated entries
data = {'year': [2010, 2011, 2012, 2011, 2012, 2010, 2011, 2012,2012, 2012],
        'team': ['Bears', 'Bears', 'Bears', 'Packers', 'Packers', 'Lions', 'Lions', 'Lions','Lions',''],
        'wins': [11, 8, 10, 15, 11, 6, 10, 4,4, 0],
        'losses': [5, 8, 6, 1, 5, 10, 6, 12,12, 0]}
football = pd.DataFrame(data, columns=['year', 'team', 'wins', 'losses'])
football

Unnamed: 0,year,team,wins,losses
0,2010,Bears,11,5
1,2011,Bears,8,8
2,2012,Bears,10,6
3,2011,Packers,15,1
4,2012,Packers,11,5
5,2010,Lions,6,10
6,2011,Lions,10,6
7,2012,Lions,4,12
8,2012,Lions,4,12
9,2012,,0,0


In [5]:
football[football.wins == 0 ]

Unnamed: 0,year,team,wins,losses
9,2012,,0,0


In [6]:
#Find duplicated entries
#DataFrame.duplicated(subset=None, keep='first')
football.duplicated(['team','year'], keep='first')

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8     True
9    False
dtype: bool

In [7]:
football[football.duplicated(['team','year'], keep='first')]

Unnamed: 0,year,team,wins,losses
8,2012,Lions,4,12


In [8]:
# Drop duplicates
#DataFrame.drop_duplicates(subset=None, keep='first', inplace=False)
football.drop_duplicates(['year', 'team'], keep='first')

Unnamed: 0,year,team,wins,losses
0,2010,Bears,11,5
1,2011,Bears,8,8
2,2012,Bears,10,6
3,2011,Packers,15,1
4,2012,Packers,11,5
5,2010,Lions,6,10
6,2011,Lions,10,6
7,2012,Lions,4,12
9,2012,,0,0


### Creating a Dataframe with list

In [9]:
data_from_list = [
    ['Belgium',  'India',  'Brazil'],
    ['Brussels',  'New Delhi',  'Brasilia'],
    [11190846, 1303171035, 207847528]
]
df_from_list = pd.DataFrame(data_from_list, columns=['Country',  'Capital',  'Population'], index=[1,2,3])
df_from_list


Unnamed: 0,Country,Capital,Population
1,Belgium,India,Brazil
2,Brussels,New Delhi,Brasilia
3,11190846,1303171035,207847528


## Reading a CSV

In [10]:
car_dataset_url ='https://raw.githubusercontent.com/ankitind/sample_datasets/master/car_ad.csv'
car_ads = pd.read_csv(car_dataset_url, header=0)
car_ads.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9576 entries, 0 to 9575
Data columns (total 10 columns):
car             9576 non-null object
price           9576 non-null float64
body            9576 non-null object
mileage         9576 non-null int64
engV            9142 non-null float64
engType         9576 non-null object
registration    9576 non-null object
year            9576 non-null int64
model           9576 non-null object
drive           9065 non-null object
dtypes: float64(2), int64(2), object(6)
memory usage: 748.2+ KB


### Reading a Dataframe using indexes and loc

In [11]:
print(car_ads['body'][1])
print(car_ads.body[1])
print(car_ads.loc[1, 'body'])
print(car_ads.iloc[1, 2])
print("--")
print(car_ads.ix[1:1, 2:3])

sedan
sedan
sedan
sedan
--
    body
1  sedan


## Slicing  Columns and Rows

### Seleting specific columns by passing them as a list

In [12]:
car_ads[['car', 'year','body']].head(4)

Unnamed: 0,car,year,body
0,Ford,2010,crossover
1,Mercedes-Benz,2011,sedan
2,Mercedes-Benz,2008,other
3,Mercedes-Benz,2012,van


### Seleting specific rows by doing a boolean operation

In [13]:
car_ads[car_ads.mileage>900][0:3].head(4)

Unnamed: 0,car,price,body,mileage,engV,engType,registration,year,model,drive
334,Toyota,36000.0,crossover,940,,Other,yes,2009,Land Cruiser 200,
2602,Mercedes-Benz,68700.0,crossover,980,3.0,Diesel,yes,2013,GL 350,full
4572,Kia,7150.0,hatch,920,1.4,Petrol,yes,2008,Ceed,front


In [14]:
#or
car_ads[(car_ads.mileage>500) & (car_ads.price>10000)].head(7)

Unnamed: 0,car,price,body,mileage,engV,engType,registration,year,model,drive
334,Toyota,36000.0,crossover,940,,Other,yes,2009,Land Cruiser 200,
666,BMW,30500.0,sedan,630,,Petrol,yes,2010,523,rear
667,BMW,29500.0,sedan,850,,Petrol,yes,2011,523,rear
668,Nissan,14500.0,crossover,740,2.0,Petrol,yes,2008,X-Trail,full
1524,Skoda,14900.0,hatch,710,1.8,Petrol,yes,2012,Octavia A5,front
1676,Bentley,117000.0,sedan,830,6.0,Petrol,yes,2011,Flying Spur,full
1889,Volkswagen,20500.0,hatch,800,2.0,Diesel,yes,2013,Golf VII,front


In [15]:
car_ads.ix[0:20,'body':'engV'].head()

Unnamed: 0,body,mileage,engV
0,crossover,68,2.5
1,sedan,173,1.8
2,other,135,5.5
3,van,162,1.8
4,vagon,91,


In [16]:
#Slicing by providing list 
car_ads.ix[0:20,['body','engV']].head()

Unnamed: 0,body,engV
0,crossover,2.5
1,sedan,1.8
2,other,5.5
3,van,1.8
4,vagon,


In [17]:
car_ads.ix[car_ads.mileage>900, 'body':'engV'].head(4)

Unnamed: 0,body,mileage,engV
334,crossover,940,
2602,crossover,980,3.0
4572,hatch,920,1.4
6857,sedan,999,1.3


### Series vs 1-column Dataframe Slicing

In [18]:
#series  (by providing string)
car_ads['body'].head()

0    crossover
1        sedan
2        other
3          van
4        vagon
Name: body, dtype: object

In [19]:
#1-Column Dataframe (by providing list)
car_ads[['body']].head()

Unnamed: 0,body
0,crossover
1,sedan
2,other
3,van
4,vagon


In [20]:
#2-Column Dataframe (by providing list)
car_ads[['body', 'price']].head()

Unnamed: 0,body,price
0,crossover,15500.0
1,sedan,20500.0
2,other,35000.0
3,van,17800.0
4,vagon,33000.0


In [21]:
#2-Column Dataframe (by providing list)
car_ads.loc[1:2,['body', 'price']]

Unnamed: 0,body,price
1,sedan,20500.0
2,other,35000.0


### Slicing in Reverse order

Slice the row labels 4 to 1 in reverse order. 
To do this for hypothetical row labels 'a' and 'b', you could use a stepsize of -1 like so: df.loc['b':'a':-1].m

In [22]:
car_ads.loc[9:1:-1,['body', 'price']]

Unnamed: 0,body,price
9,sedan,22700.0
8,sedan,21500.0
7,vagon,10500.0
6,sedan,6500.0
5,crossover,16600.0
4,vagon,33000.0
3,van,17800.0
2,other,35000.0
1,sedan,20500.0


### Filtering with multiple Boolean Operators

In [23]:
car_ads[(car_ads.price > 200000) & (car_ads.year > 2014) & (car_ads.body == 'crossover')]


Unnamed: 0,car,price,body,mileage,engV,engType,registration,year,model,drive
1611,Bentley,499999.0,crossover,0,6.0,Petrol,yes,2016,Bentayga,full
4134,Bentley,449999.0,crossover,0,6.0,Petrol,yes,2016,Bentayga,full
5074,Mercedes-Benz,222000.0,crossover,0,6.3,Petrol,yes,2016,S 63 AMG,full
7914,Bentley,499999.0,crossover,0,6.0,Petrol,yes,2016,Bentayga,full
8205,Land Rover,285000.0,crossover,0,5.0,Petrol,yes,2016,Range Rover,full


### Apply functions

In [24]:
def capitalfuc(x):
    return x.capitalize() 

car_ads_lower = car_ads['car'].apply(capitalfuc)
car_ads_lower.head(7)
#print(upperfuc("asd"))

0             Ford
1    Mercedes-benz
2    Mercedes-benz
3    Mercedes-benz
4    Mercedes-benz
5           Nissan
6            Honda
Name: car, dtype: object

#### Using Lambda function

In [25]:
car_ads_upper = car_ads['car'].apply(lambda x: x.upper())
car_ads_upper.head(7)
#print(upperfuc("asd"))

0             FORD
1    MERCEDES-BENZ
2    MERCEDES-BENZ
3    MERCEDES-BENZ
4    MERCEDES-BENZ
5           NISSAN
6            HONDA
Name: car, dtype: object

### Seelcting columns based on data types

In [26]:
car_ads_object = car_ads.select_dtypes(include=[object])
car_ads_object.head(7) 

Unnamed: 0,car,body,engType,registration,model,drive
0,Ford,crossover,Gas,yes,Kuga,full
1,Mercedes-Benz,sedan,Gas,yes,E-Class,rear
2,Mercedes-Benz,other,Petrol,yes,CL 550,rear
3,Mercedes-Benz,van,Diesel,yes,B 180,front
4,Mercedes-Benz,vagon,Other,yes,E-Class,
5,Nissan,crossover,Petrol,yes,X-Trail,full
6,Honda,sedan,Petrol,yes,Accord,front


In [27]:
car_ads_int = car_ads.select_dtypes(include=[int])
car_ads_int.head(7) 

Unnamed: 0,mileage,year
0,68,2010
1,173,2011
2,135,2008
3,162,2012
4,91,2013
5,83,2013
6,199,2003


In [28]:
car_ads_float = car_ads.select_dtypes(include=[float])
car_ads_float.head(7) 

Unnamed: 0,price,engV
0,15500.0,2.5
1,20500.0,1.8
2,35000.0,5.5
3,17800.0,1.8
4,33000.0,
5,16600.0,2.0
6,6500.0,2.0


In [29]:
car_ads_ex_float = car_ads.select_dtypes(exclude=[float])
car_ads_ex_float.head(7) 

Unnamed: 0,car,body,mileage,engType,registration,year,model,drive
0,Ford,crossover,68,Gas,yes,2010,Kuga,full
1,Mercedes-Benz,sedan,173,Gas,yes,2011,E-Class,rear
2,Mercedes-Benz,other,135,Petrol,yes,2008,CL 550,rear
3,Mercedes-Benz,van,162,Diesel,yes,2012,B 180,front
4,Mercedes-Benz,vagon,91,Other,yes,2013,E-Class,
5,Nissan,crossover,83,Petrol,yes,2013,X-Trail,full
6,Honda,sedan,199,Petrol,yes,2003,Accord,front


In [30]:
car_ads_ex_number = car_ads.select_dtypes(exclude=[np.number])
car_ads_ex_number.head(7) 

Unnamed: 0,car,body,engType,registration,model,drive
0,Ford,crossover,Gas,yes,Kuga,full
1,Mercedes-Benz,sedan,Gas,yes,E-Class,rear
2,Mercedes-Benz,other,Petrol,yes,CL 550,rear
3,Mercedes-Benz,van,Diesel,yes,B 180,front
4,Mercedes-Benz,vagon,Other,yes,E-Class,
5,Nissan,crossover,Petrol,yes,X-Trail,full
6,Honda,sedan,Petrol,yes,Accord,front


## Melting dataframes

In [45]:
a = {'country':'india', 
     'capital':'delhi', 
     'currency':'INR',
     'GDP' : 7860
    }
b = {'country':'pakistan', 'capital':'islamabad', 'currency': 'PKR',
     'GDP' : 3860}
c = {'country':'sri lanka', 'capital':'colombo', 'currency':'lankan rupee',
     'GDP' : 456}
df = pd.DataFrame([a,b,c], columns=['country', 'capital', 'currency', 'GDP'])
df

Unnamed: 0,country,capital,currency,GDP
0,india,delhi,INR,7860
1,pakistan,islamabad,PKR,3860
2,sri lanka,colombo,lankan rupee,456


In [102]:
df_pivot = df.pivot(index="country", columns="capital", values="GDP")
df_pivot.head(10)

capital,colombo,delhi,islamabad
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
india,,7860.0,
pakistan,,,3860.0
sri lanka,456.0,,


In [59]:
df_index = df.set_index(['country', 'capital'])
df_index.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,currency,GDP
country,capital,Unnamed: 2_level_1,Unnamed: 3_level_1
india,delhi,INR,7860
pakistan,islamabad,PKR,3860
sri lanka,colombo,lankan rupee,456


In [72]:
#Resetting the Index
df_no_i = df_index.reset_index()
df_no_i.head()

Unnamed: 0,country,capital,currency,GDP
0,india,delhi,INR,7860
1,pakistan,islamabad,PKR,3860
2,sri lanka,colombo,lankan rupee,456


In [62]:
#Melting
pd.melt(df)

Unnamed: 0,variable,value
0,country,india
1,country,pakistan
2,country,sri lanka
3,capital,delhi
4,capital,islamabad
5,capital,colombo
6,currency,INR
7,currency,PKR
8,currency,lankan rupee
9,GDP,7860


In [105]:
dup_rows = pd.melt(df, 
        id_vars=['country', 'capital'], 
        value_vars=['currency', 'GDP'], 
        var_name = 'CurrencyGDP', 
        value_name = 'Values')
dup_rows.head()

Unnamed: 0,country,capital,CurrencyGDP,Values
0,india,delhi,currency,INR
1,pakistan,islamabad,currency,PKR
2,sri lanka,colombo,currency,lankan rupee
3,india,delhi,GDP,7860
4,pakistan,islamabad,GDP,3860


In [110]:
#Pivot Table
new_pivot = dup_rows.pivot_table(index="country", columns="capital", values="Values", aggfunc='count')
new_pivot.head(10)

capital,colombo,delhi,islamabad
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
india,,2.0,
pakistan,,,2.0
sri lanka,2.0,,


In [99]:

df_indexn = df_index.unstack(level=1)
print(df_indexn.head())

#col_level : # Obtain the key-value pairs: kv_pairs
kv_pairs = pd.melt(df_indexn,  col_level=0)
kv_pairs.head(20)



               currency                     GDP                  
capital         colombo delhi islamabad colombo   delhi islamabad
country                                                          
india              None   INR      None     NaN  7860.0       NaN
pakistan           None  None       PKR     NaN     NaN    3860.0
sri lanka  lankan rupee  None      None   456.0     NaN       NaN


Unnamed: 0,variable,value
0,currency,
1,currency,
2,currency,lankan rupee
3,currency,INR
4,currency,
5,currency,
6,currency,
7,currency,PKR
8,currency,
9,GDP,


In [98]:

kv_pairs = pd.melt(df_indexn,  col_level=1)
kv_pairs.head(20)


Unnamed: 0,capital,value
0,colombo,
1,colombo,
2,colombo,lankan rupee
3,delhi,INR
4,delhi,
5,delhi,
6,islamabad,
7,islamabad,PKR
8,islamabad,
9,colombo,


## Groupby and Categoricalsm

In [134]:
sales = pd.DataFrame(
    {
        'weekday': ['Sun', 'Mon', 'Tue', 'Wed', 'Thr', 'Fri', 'Sat', 
                    'Sun', 'Mon', 'Tue', 'Wed', 'Thr', 'Fri', 'Sat', 
                    'Sun', 'Sun'],
        'city' :['Agra', 'Pune', 'Madras', 'Calcutta', 'Mumbai', 'Noida', 'Noida', 
                 'Delhi', 'Agra', 'Pune', 'Madras', 'Calcutta', 'Mumbai', 'Noida', 
                 'Noida', 'Delhi'],
        'bread' :[345, 555, 222, 123, 231, 189, 49,
                  89, 129, 113, 119, 213, 110, 90,
                 48, 87],
        'butter' :[235, 35, 32, 33, 41, 89, 39,
                  69, 49, 43, 119, 113, 113, 27,
                 48, 85]
        
        
    })
customers = pd.Series(['Sandeep', 'Urmil', 'Neha', 'Rahul', 'Amar', 'Rakesh', 'Nandi', 
                 'Aman', 'Sagar', 'Pratap', 'Raghu', 'Joy', 'Waqar', 'Roshan', 
                 'Ranbir', 'Sumit'])

In [114]:
sales.loc[sales.weekday=='Sun'].count()


bread      4
butter     4
city       4
weekday    4
dtype: int64

In [127]:
print(sales.groupby('city').count())
print("-------------------")
print(sales.groupby(by=['weekday','city']).mean())

          bread  butter  weekday
city                            
Agra          2       2        2
Calcutta      2       2        2
Delhi         2       2        2
Madras        2       2        2
Mumbai        2       2        2
Noida         4       4        4
Pune          2       2        2
-------------------
                  bread  butter
weekday city                   
Fri     Mumbai    110.0   113.0
        Noida     189.0    89.0
Mon     Agra      129.0    49.0
        Pune      555.0    35.0
Sat     Noida      69.5    33.0
Sun     Agra      345.0   235.0
        Delhi      88.0    77.0
        Noida      48.0    48.0
Thr     Calcutta  213.0   113.0
        Mumbai    231.0    41.0
Tue     Madras    222.0    32.0
        Pune      113.0    43.0
Wed     Calcutta  123.0    33.0
        Madras    119.0   119.0


In [133]:
sales.groupby(['weekday','city'])['bread', 'butter'].sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,bread,butter
weekday,city,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,Mumbai,110,113
Fri,Noida,189,89
Mon,Agra,129,49
Mon,Pune,555,35
Sat,Noida,139,66
Sun,Agra,345,235
Sun,Delhi,176,154
Sun,Noida,48,48
Thr,Calcutta,213,113
Thr,Mumbai,231,41


In [137]:
sales.groupby(customers)['bread'].sum()

Aman        89
Amar       231
Joy        213
Nandi       49
Neha       222
Pratap     113
Raghu      119
Rahul      123
Rakesh     189
Ranbir      48
Roshan      90
Sagar      129
Sandeep    345
Sumit       87
Urmil      555
Waqar      110
Name: bread, dtype: int64

In [139]:
sales['city'].unique()

array(['Agra', 'Pune', 'Madras', 'Calcutta', 'Mumbai', 'Noida', 'Delhi'], dtype=object)

In [140]:
sales['city'].value_counts()

Noida       4
Pune        2
Mumbai      2
Delhi       2
Calcutta    2
Madras      2
Agra        2
Name: city, dtype: int64

In [142]:
sales['city'] = sales['city'].astype('category')
sales['city']

0         Agra
1         Pune
2       Madras
3     Calcutta
4       Mumbai
5        Noida
6        Noida
7        Delhi
8         Agra
9         Pune
10      Madras
11    Calcutta
12      Mumbai
13       Noida
14       Noida
15       Delhi
Name: city, dtype: category
Categories (7, object): [Agra, Calcutta, Delhi, Madras, Mumbai, Noida, Pune]

## Map Function
The .map() method is used to transform values according to a Python dictionary look-up. 

In [42]:
def multiply(x):
    x = x*2
    
car_ads['new'] = map(multiply, car_ads['price'])
car_ads.head()


car_ads['new2'] = car_ads.applymap(multiply, car_ads['price'])

TypeError: applymap() takes 2 positional arguments but 3 were given

In [None]:
car_ads.loc[0:20,['new']]