In [1]:
#Milestone 2: Flat file

import pandas as pd

#Importing the CSV file
cars = pd.read_csv('CO2 Emissions_Canada.csv')

#Removing all categorical series besides the make and model. Also removing the fuel consumption (MPG) series as it is redundant. 
cars[['Make', 'Model', 'Engine Size(L)', 'Cylinders', 'Fuel Consumption City (L/100 km)', 'Fuel Consumption Hwy (L/100 km)', 'Fuel Consumption Comb (L/100 km)', 'CO2 Emissions(g/km)']]

#Capitalizing all strings to ensure duplicates can be aggregated
cars['Make'] = cars['Make'].str.upper()
cars['Model'] = cars['Model'].str.upper()

#Removing all hybrid vehicles from the data frame, as they are outliers
cars = cars[~cars['Model'].str.contains('HYBRID')]

#Grouping the cars by make and model then aggregating duplicate models by taking the mean of the values
cars = cars.groupby(['Make', 'Model'], as_index = False).mean()

#Dropping the trim names from the model by splitting the model names and only returning the first word to the series.
#Trim names do not follow any naming convention, and thus, make the data harder to analyze.
A = cars['Model'].str.split(' ')
new_models = []
for item in A:
    #6 car brands have model names with multiple words, so these 6 will have the subsequent words returned as well.
    if item[0] == 'GRAND' or item[0] == 'SANTA' or item[0] == 'NEW' or item[0] == 'FLYING' or item[0] == 'RANGE':
        new_models.append(item[0] + ' ' + item[1])
    elif item[0] == 'TOWN':
        new_models.append(item[0] + ' ' + item[1] + ' ' + item[2])
    else:
        new_models.append(item[0])

#Converting the truncated model names to a Pandas series and adding it as the new 'Model' column in the data frame.
cars['Model'] = pd.Series(new_models)

#Now that the trim names have been dropped, I will aggregate all of the same models to give one trim average for each model.
cars = cars.groupby(['Make', 'Model'], as_index = False).mean()

#Mazda has a bad habit of adding their make name to some of their models. I remove the make name here to avoid confusion.
cars['Model'] = cars['Model'].str.replace('MAZDA2', '2')
cars['Model'] = cars['Model'].str.replace('MAZDA3', '3')
cars['Model'] = cars['Model'].str.replace('MAZDA5', '5')
cars['Model'] = cars['Model'].str.replace('MAZDA6', '6')

#Final result: A data frame of cars cleaned of outliers, where each row represents one unique car model (the average of all its trim levels) and each column is a numerical attribute. 
cars[cars['Make'] == 'LEXUS']

Unnamed: 0,Make,Model,Engine Size(L),Cylinders,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km)
256,LEXUS,CT,1.8,4.0,5.5,5.875,5.675,50.0,131.0
257,LEXUS,ES,3.166667,5.333333,9.257143,6.952381,8.214286,36.285714,191.190476
258,LEXUS,GS,3.285714,5.714286,11.684082,8.244898,10.113878,28.561905,236.110204
259,LEXUS,GX,4.6,8.0,15.9,12.314286,14.285714,19.428571,331.714286
260,LEXUS,IS,3.05,5.8,11.951048,8.468524,10.38481,27.399048,240.312381
261,LEXUS,LC,4.25,7.0,11.966667,8.233333,10.266667,28.666667,239.833333
262,LEXUS,LS,4.075,7.0,12.775,8.958333,11.041667,26.291667,257.166667
263,LEXUS,LX,5.7,8.0,18.557143,13.285714,16.185714,17.571429,377.428571
264,LEXUS,NX,2.1,4.0,10.013333,8.5,9.32,30.8,217.333333
265,LEXUS,RC,3.5,6.0,12.743333,8.892,10.878,26.14,253.413333
