# Introduction to Pandas

In [1]:
import pandas as pd #this will import pandas into your workspace
import numpy as np 
from pandas import Series, DataFrame

# Data Structures in pandas

There are two basic data structures in pandas: Series and DataFrame

# Series

# An example of  first python series¶

In [2]:
series1 = pd.Series([10,20,30,40]) #we have used a list to create a series.
print(series1)

0    10
1    20
2    30
3    40
dtype: int64


# Printing the values that are there in the series

In [3]:
series1.values

array([10, 20, 30, 40], dtype=int64)

# Figuring out the index numbers that are there in the series

In [4]:
series1.index #This would print the starting index and ending index

RangeIndex(start=0, stop=4, step=1)

# Specifying custom index values rather than the default ones provided, you can do so using the following command

In [5]:
series2 = pd.Series([10,20,30,40,50], index=['one','two','three','four','five'])
series2

one      10
two      20
three    30
four     40
five     50
dtype: int64

In [6]:
# Lets print the element which is there in the position 2

In [7]:
series2[2]

30

# Lets retrive the element using index number

In [8]:
series2 = pd.Series([10,20,30,40,50], index=['one','two','three','four','five'])
print(series2)
series2['three']

one      10
two      20
three    30
four     40
five     50
dtype: int64


30

# Lets access multiple elements

In [9]:
series2[['one', 'three', 'five']]

one      10
three    30
five     50
dtype: int64

# Lets add "4" to each element of the series (math operations)

In [10]:
series2 + 4

one      14
two      24
three    34
four     44
five     54
dtype: int64

# Lets subset the entire series whose value is greater than 30

In [11]:
series2[series2>30]

four    40
five    50
dtype: int64

# In class lab exercise: 
Add 5 to each element of the series and subset the entire series whose value is greater than 40?

# Data Frame

# Lets create a Data Frame with multiple columns called Price, Ticker and Company.

In [12]:
data = DataFrame({'price':[95,25,85,41,78],'ticker':['AXP','CSCO','DIS','MSFT','WMT'],
                  'company':['American Express','Cisco','Walt Disney','Microsoft','Walmart']})

In [13]:
data

Unnamed: 0,company,price,ticker
0,American Express,95,AXP
1,Cisco,25,CSCO
2,Walt Disney,85,DIS
3,Microsoft,41,MSFT
4,Walmart,78,WMT


# How to access a specefic column from the data frame?

In [14]:
data['company']

0    American Express
1               Cisco
2         Walt Disney
3           Microsoft
4             Walmart
Name: company, dtype: object

# How to access a specefic row from the data frame?

In [15]:
data.ix[2] #Will print all the elements of second row 

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  """Entry point for launching an IPython kernel.


company    Walt Disney
price               85
ticker             DIS
Name: 2, dtype: object

# How to add a new column in the data frame?

In [16]:
data['Year'] = 2014
data

Unnamed: 0,company,price,ticker,Year
0,American Express,95,AXP,2014
1,Cisco,25,CSCO,2014
2,Walt Disney,85,DIS,2014
3,Microsoft,41,MSFT,2014
4,Walmart,78,WMT,2014


#  In class lab exercise: 
Create a new column "prices_discount" in the above data frame where the value should be 10% discounted from the original column "price". This change must be implmented in the same data frame¶

# How to create a column and populate it with missing values(NaN) ?

In [17]:
data['delta_col'] = np.nan
data

Unnamed: 0,company,price,ticker,Year,delta_col
0,American Express,95,AXP,2014,
1,Cisco,25,CSCO,2014,
2,Walt Disney,85,DIS,2014,
3,Microsoft,41,MSFT,2014,
4,Walmart,78,WMT,2014,


# How to delete a column

In [18]:
# del data['name_of_the_col_to_delete']

del data['delta_col']
print(data)

            company  price ticker  Year
0  American Express     95    AXP  2014
1             Cisco     25   CSCO  2014
2       Walt Disney     85    DIS  2014
3         Microsoft     41   MSFT  2014
4           Walmart     78    WMT  2014


# How to drop a column?

In [19]:
newdata = data.drop(2)
print(newdata)

            company  price ticker  Year
0  American Express     95    AXP  2014
1             Cisco     25   CSCO  2014
3         Microsoft     41   MSFT  2014
4           Walmart     78    WMT  2014


# How to do a transpose of a dataframe?

In [20]:
dft = data.T #Transpose operation will interchange the rows and columns
dft

Unnamed: 0,0,1,2,3,4
company,American Express,Cisco,Walt Disney,Microsoft,Walmart
price,95,25,85,41,78
ticker,AXP,CSCO,DIS,MSFT,WMT
Year,2014,2014,2014,2014,2014


# Indexing

In [21]:
df1= data.ix[:,'price']
df1

0    95
1    25
2    85
3    41
4    78
Name: price, dtype: int64

In [22]:
#based on row and column
data.ix[3,2]

'MSFT'

# Indexing with iloc:

In [23]:
# select first 2 rows
data.iloc[:2]
# or
data.iloc[:2,]

Unnamed: 0,company,price,ticker,Year
0,American Express,95,AXP,2014
1,Cisco,25,CSCO,2014


In [24]:
# select 3rd to 5th rows
data.iloc[2:5]
# or 
data.iloc[2:5,]

Unnamed: 0,company,price,ticker,Year
2,Walt Disney,85,DIS,2014
3,Microsoft,41,MSFT,2014
4,Walmart,78,WMT,2014


# In class lab exercise:
Select all the rows starting from third row using iloc()

# Select column by using column number in pandas with .iloc

In [25]:
# select first 2 columns
 
data.iloc[:,:2]

Unnamed: 0,company,price
0,American Express,95
1,Cisco,25
2,Walt Disney,85
3,Microsoft,41
4,Walmart,78


In [26]:
# select 1st and 4thcolumn
data.iloc[:,[0,3]]

Unnamed: 0,company,Year
0,American Express,2014
1,Cisco,2014
2,Walt Disney,2014
3,Microsoft,2014
4,Walmart,2014


# In class lab exercise:
Select 2nd row and 3rd column value?

# indexing with loc :

In [27]:
# select row by now name
 
data.loc[1]

company    Cisco
price         25
ticker      CSCO
Year        2014
Name: 1, dtype: object

In [28]:
# select value by row label and column label using loc
 
data.loc[[1,2,3,4,5],['price','ticker']]

Unnamed: 0,price,ticker
1,25.0,CSCO
2,85.0,DIS
3,41.0,MSFT
4,78.0,WMT
5,,


# How to reindex the data?

In [29]:
new_data = data.reindex(index=[0,2], columns=['company', 'price'])
print(new_data)

            company  price
0  American Express     95
2       Walt Disney     85


# How to fill a missing value with some value in the data frame?

In [30]:
years = [90, 91, 92, 93, 94, 95]
f4 = {90:8, 91:9, 92:7, 93:8, 94:9, 95:11}
firm4 = pd.Series(f4,index=years)
f5 = {90:14,91:12, 92:9, 93:13, 94:5, 95:8}
firm5 = pd.Series(f5,index=years)
f6 = {90:8, 91: 9, 92:9,93:10, 94:12, 95: 13}
firm6 = pd.Series(f6,index=years)
df2 = pd.DataFrame(columns=['Firm1','Firm2','Firm3'],index=years)
df2.Firm1 = firm4
df2.Firm2 = firm5
df2.Firm3 = firm6
#df2['Firm1'] = firm4
df2

Unnamed: 0,Firm1,Firm2,Firm3
90,8,14,8
91,9,12,9
92,7,9,9
93,8,13,10
94,9,5,12
95,11,8,13


In [31]:
#Note: reindex with only row arguments i.e we want row 88, 89 etc from above df2
reindexdf2 = df2.reindex([88,89,90,91,92,93,94,95,96,97,98], fill_value=0)
reindexdf2

Unnamed: 0,Firm1,Firm2,Firm3
88,0,0,0
89,0,0,0
90,8,14,8
91,9,12,9
92,7,9,9
93,8,13,10
94,9,5,12
95,11,8,13
96,0,0,0
97,0,0,0


# How to fill the missing value with the previous value?

In [32]:
reindexdf3 = df2.reindex([88,89,90,91,92,93,94,95,96,97,98], method='ffill')
reindexdf3

Unnamed: 0,Firm1,Firm2,Firm3
88,,,
89,,,
90,8.0,14.0,8.0
91,9.0,12.0,9.0
92,7.0,9.0,9.0
93,8.0,13.0,10.0
94,9.0,5.0,12.0
95,11.0,8.0,13.0
96,11.0,8.0,13.0
97,11.0,8.0,13.0


# In class lab exercise:
Create a data frame with three columns named one, two and three and fill the values with random numbers?
Hint: Use numpy to create random numbers.

# Drop the duplicate row of a dataframe

In [33]:
import pandas as pd
import numpy as np
 
#Create a DataFrame
d = {
    'Name':['Alisa','Bobby','jodha','jack','raghu','Cathrine',
            'Alisa','Bobby','kumar','Alisa','Alex','Cathrine'],
    'Age':[26,24,23,22,23,24,26,24,22,23,24,24],
      
    'Score':[85,63,55,74,31,77,85,63,42,62,89,77]}
 
df = pd.DataFrame(d,columns=['Name','Age','Score'])
df

Unnamed: 0,Name,Age,Score
0,Alisa,26,85
1,Bobby,24,63
2,jodha,23,55
3,jack,22,74
4,raghu,23,31
5,Cathrine,24,77
6,Alisa,26,85
7,Bobby,24,63
8,kumar,22,42
9,Alisa,23,62


# Drop the duplicate rows:

In [34]:
df.drop_duplicates()

Unnamed: 0,Name,Age,Score
0,Alisa,26,85
1,Bobby,24,63
2,jodha,23,55
3,jack,22,74
4,raghu,23,31
5,Cathrine,24,77
8,kumar,22,42
9,Alisa,23,62
10,Alex,24,89


# Drop the duplicate by retaining last occurrence:

In [35]:
df.drop_duplicates(keep='last')

Unnamed: 0,Name,Age,Score
2,jodha,23,55
3,jack,22,74
4,raghu,23,31
6,Alisa,26,85
7,Bobby,24,63
8,kumar,22,42
9,Alisa,23,62
10,Alex,24,89
11,Cathrine,24,77


# Drop the duplicate by column:

In [36]:
df.drop_duplicates(['Name'], keep='last')

Unnamed: 0,Name,Age,Score
2,jodha,23,55
3,jack,22,74
4,raghu,23,31
7,Bobby,24,63
8,kumar,22,42
9,Alisa,23,62
10,Alex,24,89
11,Cathrine,24,77


# Simply drop a row or observation:

In [37]:
df.drop([1,2])

Unnamed: 0,Name,Age,Score
0,Alisa,26,85
3,jack,22,74
4,raghu,23,31
5,Cathrine,24,77
6,Alisa,26,85
7,Bobby,24,63
8,kumar,22,42
9,Alisa,23,62
10,Alex,24,89
11,Cathrine,24,77


# Drop a row or observation by condition:

In [38]:
df[df.Name != 'Alisa']

Unnamed: 0,Name,Age,Score
1,Bobby,24,63
2,jodha,23,55
3,jack,22,74
4,raghu,23,31
5,Cathrine,24,77
7,Bobby,24,63
8,kumar,22,42
10,Alex,24,89
11,Cathrine,24,77


# Drop a row or observation by index:

In [39]:
df.drop(df.index[2])

Unnamed: 0,Name,Age,Score
0,Alisa,26,85
1,Bobby,24,63
3,jack,22,74
4,raghu,23,31
5,Cathrine,24,77
6,Alisa,26,85
7,Bobby,24,63
8,kumar,22,42
9,Alisa,23,62
10,Alex,24,89


# Drop the row by position:

In [40]:
# Drop bottom 3 rows
df[:-3]

Unnamed: 0,Name,Age,Score
0,Alisa,26,85
1,Bobby,24,63
2,jodha,23,55
3,jack,22,74
4,raghu,23,31
5,Cathrine,24,77
6,Alisa,26,85
7,Bobby,24,63
8,kumar,22,42


# Drop a column by name:

In [41]:
# drop a column based on name
 
df.drop('Age',axis=1)

Unnamed: 0,Name,Score
0,Alisa,85
1,Bobby,63
2,jodha,55
3,jack,74
4,raghu,31
5,Cathrine,77
6,Alisa,85
7,Bobby,63
8,kumar,42
9,Alisa,62


# Drop a column based on column index:

In [42]:
# drop a column based on column index
 
df.drop(df.columns[2],axis=1)

Unnamed: 0,Name,Age
0,Alisa,26
1,Bobby,24
2,jodha,23
3,jack,22
4,raghu,23
5,Cathrine,24
6,Alisa,26
7,Bobby,24
8,kumar,22
9,Alisa,23


# Delete a column based on column name:

In [43]:
# delete a column
 
del df['Age']
df

Unnamed: 0,Name,Score
0,Alisa,85
1,Bobby,63
2,jodha,55
3,jack,74
4,raghu,31
5,Cathrine,77
6,Alisa,85
7,Bobby,63
8,kumar,42
9,Alisa,62


# sort a dataframe in python

In [44]:
import pandas as pd
import numpy as np
 
#Create a Dictionary of series
d = {'Name':pd.Series(['Alisa','Bobby','Cathrine','Madonna','Rocky','Sebastian','Jaqluine',
   'Rahul','David','Andrew','Ajay','Teresa']),
   'Age':pd.Series([26,27,25,24,31,27,25,33,42,32,51,47]),
   'Score':pd.Series([89,87,67,55,47,72,76,79,44,92,99,69])}
 
#Create a DataFrame
df2 = pd.DataFrame(d)
print (df2)

    Age       Name  Score
0    26      Alisa     89
1    27      Bobby     87
2    25   Cathrine     67
3    24    Madonna     55
4    31      Rocky     47
5    27  Sebastian     72
6    25   Jaqluine     76
7    33      Rahul     79
8    42      David     44
9    32     Andrew     92
10   51       Ajay     99
11   47     Teresa     69


# Sort the python pandas Dataframe by single column – Ascending order:

In [45]:
# sort the pandas dataframe by ascending value of single column
 
df2.sort_values(by='Score')

Unnamed: 0,Age,Name,Score
8,42,David,44
4,31,Rocky,47
3,24,Madonna,55
2,25,Cathrine,67
11,47,Teresa,69
5,27,Sebastian,72
6,25,Jaqluine,76
7,33,Rahul,79
1,27,Bobby,87
0,26,Alisa,89


# Sort a Dataframe in python pandas by single Column – descending order

In [46]:
df2.sort_values(by='Score',ascending=0)

Unnamed: 0,Age,Name,Score
10,51,Ajay,99
9,32,Andrew,92
0,26,Alisa,89
1,27,Bobby,87
7,33,Rahul,79
6,25,Jaqluine,76
5,27,Sebastian,72
11,47,Teresa,69
2,25,Cathrine,67
3,24,Madonna,55


# In-class lab execise: Sort the column " Age"?

# sort the dataframe in python pandas by index in ascending order:

In [47]:
df3=df2.sort_index()
df3

Unnamed: 0,Age,Name,Score
0,26,Alisa,89
1,27,Bobby,87
2,25,Cathrine,67
3,24,Madonna,55
4,31,Rocky,47
5,27,Sebastian,72
6,25,Jaqluine,76
7,33,Rahul,79
8,42,David,44
9,32,Andrew,92


# In-class lab exercise: 
Sorting pandas dataframe by index in descending order?

# Rank the dataframe in python

In [48]:
import pandas as pd
import numpy as np
 
#Create a DataFrame
d = {
'Name':['Alisa','Bobby','Cathrine','Alisa','Bobby','Cathrine',
'Alisa','Bobby','Cathrine','Alisa','Bobby','Cathrine'],
'Subject':['Mathematics','Mathematics','Mathematics','Science','Science','Science',
'History','History','History','Economics','Economics','Economics'],
'Score':[62,47,55,74,31,77,85,63,42,62,89,85]}
 
df4 = pd.DataFrame(d,columns=['Name','Subject','Score'])
df4

Unnamed: 0,Name,Subject,Score
0,Alisa,Mathematics,62
1,Bobby,Mathematics,47
2,Cathrine,Mathematics,55
3,Alisa,Science,74
4,Bobby,Science,31
5,Cathrine,Science,77
6,Alisa,History,85
7,Bobby,History,63
8,Cathrine,History,42
9,Alisa,Economics,62


# Ranking the dataframe in python pandas on ascending order:

In [49]:
# Ranking of score ascending order
 
df4['score_ranked']=df['Score'].rank(ascending=1)
df4

Unnamed: 0,Name,Subject,Score,score_ranked
0,Alisa,Mathematics,62,10.5
1,Bobby,Mathematics,47,5.5
2,Cathrine,Mathematics,55,3.0
3,Alisa,Science,74,7.0
4,Bobby,Science,31,1.0
5,Cathrine,Science,77,8.5
6,Alisa,History,85,10.5
7,Bobby,History,63,5.5
8,Cathrine,History,42,2.0
9,Alisa,Economics,62,4.0


# Ranking the dataframe in python pandas on descending order

In [50]:
df4['score_ranked']=df['Score'].rank(ascending=0)
df4

Unnamed: 0,Name,Subject,Score,score_ranked
0,Alisa,Mathematics,62,2.5
1,Bobby,Mathematics,47,7.5
2,Cathrine,Mathematics,55,10.0
3,Alisa,Science,74,6.0
4,Bobby,Science,31,12.0
5,Cathrine,Science,77,4.5
6,Alisa,History,85,2.5
7,Bobby,History,63,7.5
8,Cathrine,History,42,11.0
9,Alisa,Economics,62,9.0


# Rank the dataframe in python pandas by minimum value of the rank

In [51]:
df4['score_ranked']=df['Score'].rank(ascending=0,method='min')
df4

Unnamed: 0,Name,Subject,Score,score_ranked
0,Alisa,Mathematics,62,2.0
1,Bobby,Mathematics,47,7.0
2,Cathrine,Mathematics,55,10.0
3,Alisa,Science,74,6.0
4,Bobby,Science,31,12.0
5,Cathrine,Science,77,4.0
6,Alisa,History,85,2.0
7,Bobby,History,63,7.0
8,Cathrine,History,42,11.0
9,Alisa,Economics,62,9.0


#  In-class lab exercise: 
Rank the dataframe in python pandas by maximum value of the rank?

# Rank the dataframe in python pandas by dense rank

In [52]:
df4['score_ranked']=df['Score'].rank(ascending=0,method='dense')
df4

Unnamed: 0,Name,Subject,Score,score_ranked
0,Alisa,Mathematics,62,2.0
1,Bobby,Mathematics,47,5.0
2,Cathrine,Mathematics,55,7.0
3,Alisa,Science,74,4.0
4,Bobby,Science,31,9.0
5,Cathrine,Science,77,3.0
6,Alisa,History,85,2.0
7,Bobby,History,63,5.0
8,Cathrine,History,42,8.0
9,Alisa,Economics,62,6.0


# Hierarchical indexing or multiple indexing in python 

In [53]:
import pandas as pd
import numpy as np
 
#Create a DataFrame
d = {
    'Name':['Alisa','Bobby','Cathrine','Alisa','Bobby','Cathrine',
            'Alisa','Bobby','Cathrine','Alisa','Bobby','Cathrine'],
    'Exam':['Semester 1','Semester 1','Semester 1','Semester 1','Semester 1','Semester 1',
            'Semester 2','Semester 2','Semester 2','Semester 2','Semester 2','Semester 2'],
     
    'Subject':['Mathematics','Mathematics','Mathematics','Science','Science','Science',
               'Mathematics','Mathematics','Mathematics','Science','Science','Science'],
   'Score':[62,47,55,74,31,77,85,63,42,67,89,81]}
 
df5 = pd.DataFrame(d,columns=['Name','Exam','Subject','Score'])
df5

Unnamed: 0,Name,Exam,Subject,Score
0,Alisa,Semester 1,Mathematics,62
1,Bobby,Semester 1,Mathematics,47
2,Cathrine,Semester 1,Mathematics,55
3,Alisa,Semester 1,Science,74
4,Bobby,Semester 1,Science,31
5,Cathrine,Semester 1,Science,77
6,Alisa,Semester 2,Mathematics,85
7,Bobby,Semester 2,Mathematics,63
8,Cathrine,Semester 2,Mathematics,42
9,Alisa,Semester 2,Science,67


In [54]:

df6=df5.set_index(['Exam', 'Subject'])
df6

Unnamed: 0_level_0,Unnamed: 1_level_0,Name,Score
Exam,Subject,Unnamed: 2_level_1,Unnamed: 3_level_1
Semester 1,Mathematics,Alisa,62
Semester 1,Mathematics,Bobby,47
Semester 1,Mathematics,Cathrine,55
Semester 1,Science,Alisa,74
Semester 1,Science,Bobby,31
Semester 1,Science,Cathrine,77
Semester 2,Mathematics,Alisa,85
Semester 2,Mathematics,Bobby,63
Semester 2,Mathematics,Cathrine,42
Semester 2,Science,Alisa,67


In [55]:
# View index
df6.index

MultiIndex(levels=[['Semester 1', 'Semester 2'], ['Mathematics', 'Science']],
           labels=[[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1], [0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1]],
           names=['Exam', 'Subject'])

# Swap the column in the hierarchical index:

In [56]:
df6.swaplevel('Subject','Exam')

Unnamed: 0_level_0,Unnamed: 1_level_0,Name,Score
Subject,Exam,Unnamed: 2_level_1,Unnamed: 3_level_1
Mathematics,Semester 1,Alisa,62
Mathematics,Semester 1,Bobby,47
Mathematics,Semester 1,Cathrine,55
Science,Semester 1,Alisa,74
Science,Semester 1,Bobby,31
Science,Semester 1,Cathrine,77
Mathematics,Semester 2,Alisa,85
Mathematics,Semester 2,Bobby,63
Mathematics,Semester 2,Cathrine,42
Science,Semester 2,Alisa,67


# Handling missing Data

In [57]:
import pandas as pd
import numpy as np

df7 = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f',
'h'],columns=['one', 'two', 'three'])

df7 = df7.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])

print (df7)

        one       two     three
a -0.670537 -0.284556 -1.918253
b       NaN       NaN       NaN
c  0.358496 -0.547448 -0.684603
d       NaN       NaN       NaN
e  0.025641  0.468740  1.400746
f  1.555523 -0.250371  0.702731
g       NaN       NaN       NaN
h -0.626580 -0.734126 -1.045101


# Check for Missing Values

In [58]:
df7['one'].isnull()

a    False
b     True
c    False
d     True
e    False
f    False
g     True
h    False
Name: one, dtype: bool

In [59]:
print (df7['one'].notnull())

a     True
b    False
c     True
d    False
e     True
f     True
g    False
h     True
Name: one, dtype: bool


# Fill NA Forward and Backward

In [60]:
print (df7.fillna(method='ffill'))

        one       two     three
a -0.670537 -0.284556 -1.918253
b -0.670537 -0.284556 -1.918253
c  0.358496 -0.547448 -0.684603
d  0.358496 -0.547448 -0.684603
e  0.025641  0.468740  1.400746
f  1.555523 -0.250371  0.702731
g  1.555523 -0.250371  0.702731
h -0.626580 -0.734126 -1.045101


In [61]:
print (df7.fillna(method='backfill'))

        one       two     three
a -0.670537 -0.284556 -1.918253
b  0.358496 -0.547448 -0.684603
c  0.358496 -0.547448 -0.684603
d  0.025641  0.468740  1.400746
e  0.025641  0.468740  1.400746
f  1.555523 -0.250371  0.702731
g -0.626580 -0.734126 -1.045101
h -0.626580 -0.734126 -1.045101


# Drop Missing Values

In [62]:
print (df7.dropna())

        one       two     three
a -0.670537 -0.284556 -1.918253
c  0.358496 -0.547448 -0.684603
e  0.025641  0.468740  1.400746
f  1.555523 -0.250371  0.702731
h -0.626580 -0.734126 -1.045101


In [63]:
print (df7.dropna(axis=1))

Empty DataFrame
Columns: []
Index: [a, b, c, d, e, f, g, h]


# Replace Missing Values

In [64]:
import pandas as pd
import numpy as np
df8 = pd.DataFrame({'one':[10,20,30,40,50,2000],
'two':[1000,0,30,40,50,60]})
print (df8.replace({1000:10,2000:60}))

   one  two
0   10   10
1   20    0
2   30   30
3   40   40
4   50   50
5   60   60


# Take home exercise
Description about the "mtcars" data set can be found in the below link
https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/mtcars.html

Create a new data frame from the "mtcars" provided to you as csv file. The new data frame must have the following colunms

Col 1 : Cubic capacity in cubic centemeters and this must be a whole number ( with floor and ceiling corrected based on > or < 0.5 respectively Hint : Use the round() function) the existing data frame contains this data in cubic inches (engine diplacement column)

1 cubic inch = 16.387 cubic centimeteres

Col 2 : Power is to Weight Ratio, weight of the car is provided in units per 1000 LBS, you would need to convert it to LB's first and then calculate the power/weight ratio. In case if this is in too low a decimal number, then you will have to represent it appropriately by convering it to a whole number which is readable.

Col 3 : Milage (Note: The places where NaN is marked have to be ignored for calculation and should be present as NaN in the final output)

The final output in the data frame is must be sorted based on the cc of the engine. The final output must contain the rows of only those car models where is engine capacity is greater than 2500 (where cc is converted from cubic inches to cubic centimeters)

The code should be well organized into user defined functions wherever applicable. The car names columns must be retained as the original data frame.