In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from clean_data import clean_data, prepare_data, prepare_data_no_dummy, yearSplit, manufacturerSplit
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import normalize

In [2]:
# Read files
train = pd.read_csv('../cars-competition/data/cars_train.csv', index_col='Id')
submission = pd.read_csv('../cars-competition/data/cars_test.csv', index_col='Id')

In [3]:
# Clean Data
train = clean_data(train)
submission = clean_data(submission)

Cleaning cylinders...
Cleaning condition...
Cleaning odometer...
Cleaning drive...
Cleaning size...
Cleaning manufacturer...
Cleaning fuel...
Cleaning transmission...
Cleaning title status...
Data cleaning complete!
Cleaning cylinders...
Cleaning condition...
Cleaning odometer...
Cleaning drive...
Cleaning size...
Cleaning manufacturer...
Cleaning fuel...
Cleaning transmission...
Cleaning title status...
Data cleaning complete!


In [4]:
# Prepare Data
# columns = ['year','manufacturer','condition','cylinders','fuel','odometer','title_status','transmission','drive','size']
columns = ['year','manufacturer','condition','cylinders','odometer','title_status','transmission','size','lat','long']
X, y = prepare_data_no_dummy(train,columns,typ='train')
X_sub, _ = prepare_data_no_dummy(submission,columns,typ='test')

Preparing data...
Data preparation complete!
Preparing data...
Data preparation complete!


In [5]:
X

Unnamed: 0_level_0,year,manufacturer,condition,cylinders,odometer,title_status,transmission,size,lat,long
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
559327,2006.0,unknown,2,4,190000.000000,clean,automatic,mid-size,39.287000,-76.647600
1429566,2018.0,dodge,0,6,113584.991571,clean,automatic,full-size,37.720000,-89.215800
931606,1997.0,ford,0,6,113584.991571,clean,automatic,full-size,43.483300,-83.383500
1265412,2003.0,ram,0,8,113584.991571,clean,automatic,full-size,27.850069,-99.668883
1133731,2000.0,dodge,0,6,113584.991571,clean,automatic,full-size,29.165000,-81.539900
...,...,...,...,...,...,...,...,...,...,...
1505912,2005.0,ford,0,6,238047.000000,clean,automatic,full-size,48.077990,-123.159590
1436810,2016.0,hyundai,2,6,40516.000000,clean,automatic,full-size,42.278560,-88.031417
170772,2016.0,gmc,2,8,151763.000000,clean,automatic,full-size,34.683630,-86.577461
1649643,2016.0,hyundai,2,4,35028.000000,clean,automatic,mid-size,43.758910,-87.753580


In [6]:
X.shape

(469992, 10)

In [7]:
# Split data by Manufacturer
def yearSplit(X,y=pd.DataFrame(None)):
    decade = {manu:'' for manu in set(X['manufacturer'])}
    print(decade)
    y_decade = {}
    for i in decade.keys():
        print('Spliting data by manufacturer: {}'.format(i))
        dec = X['manufacturer']==i
        decade[i] = X[dec]
        if any(y != None):
            y_decade[i] = y[dec]
    return decade, y_decade

In [13]:
X_dec, y_dec = yearSplit(X,y)
X_sub_dec, _ = yearSplit(X_sub)

{'ferrari': '', 'kia': '', 'harley-davidson': '', 'morgan': '', 'mazda': '', 'mini': '', 'volvo': '', 'subaru': '', 'buick': '', 'cadillac': '', 'land rover': '', 'bmw': '', 'dodge': '', 'chrysler': '', 'mitsubishi': '', 'ford': '', 'porche': '', 'ram': '', 'chevrolet': '', 'infiniti': '', 'nissan': '', 'lincoln': '', 'mercedes-benz': '', 'jeep': '', 'gmc': '', 'volkswagen': '', 'hyundai': '', 'alfa-romeo': '', 'acura': '', 'aston-martin': '', 'pontiac': '', 'jaguar': '', 'toyota': '', 'fiat': '', 'honda': '', 'lexus': '', 'mercury': '', 'audi': '', 'saturn': '', 'datsun': '', 'unknown': ''}
Spliting data by manufacturer: 
Spliting data by manufacturer: 
Spliting data by manufacturer: 
Spliting data by manufacturer: 
Spliting data by manufacturer: 
Spliting data by manufacturer: 
Spliting data by manufacturer: 
Spliting data by manufacturer: 
Spliting data by manufacturer: 
Spliting data by manufacturer: 
Spliting data by manufacturer: 
Spliting data by manufacturer: 
Spliting data by 

In [14]:
shape = []
for i in X_sub_dec.values():
    shape.append(i.shape[0])

In [15]:
X_sub.shape

(253073, 10)

In [16]:
sum(shape)

253073

In [17]:
X_dec

{'ferrari':                 year manufacturer  condition  cylinders       odometer  \
 Id                                                                       
 1446117  1981.000000      ferrari          1          4   17000.000000   
 723398   2006.000000      ferrari          0          6  113584.991571   
 1321337  1994.000000      ferrari          2          8   40000.000000   
 608244   2010.000000      ferrari          1          8   17585.000000   
 684579   2010.000000      ferrari          0          8   55500.000000   
 162605   2010.000000      ferrari          2          6   17030.000000   
 635380   1997.000000      ferrari          2          6   28305.000000   
 315525   2000.000000      ferrari          2          8   46609.000000   
 212262   1985.000000      ferrari          0          6  113584.991571   
 1190071  2000.000000      ferrari          0          8   25580.000000   
 482666   2005.000000      ferrari          0          8    7926.000000   
 841806   2005

In [18]:
# Normalize columns
norm = ['year','condition','cylinders','odometer','lat','long']
for i, X in X_dec.items():
    X[norm] = normalize(X[norm])
for i, X_sub in X_sub_dec.items():
    X_sub[norm] = normalize(X_sub[norm])

In [19]:
X_dec

{'ferrari':              year manufacturer  condition  cylinders  odometer title_status  \
 Id                                                                            
 1446117  0.115744      ferrari   0.000058   0.000234  0.993263        clean   
 723398   0.017658      ferrari   0.000000   0.000053  0.999844        clean   
 1321337  0.049788      ferrari   0.000050   0.000200  0.998757        clean   
 608244   0.113560      ferrari   0.000056   0.000452  0.993506        clean   
 684579   0.036192      ferrari   0.000000   0.000144  0.999342        clean   
 162605   0.117210      ferrari   0.000117   0.000350  0.993077        clean   
 635380   0.070377      ferrari   0.000070   0.000211  0.997511        clean   
 315525   0.042871      ferrari   0.000043   0.000171  0.999079        clean   
 212262   0.017473      ferrari   0.000000   0.000053  0.999847      rebuilt   
 1190071  0.077948      ferrari   0.000000   0.000312  0.996952        clean   
 482666   0.245215      ferra

In [20]:
for i, dic in X_dec.items():
    print(dic)
    break

             year manufacturer  condition  cylinders  odometer title_status  \
Id                                                                            
1446117  0.115744      ferrari   0.000058   0.000234  0.993263        clean   
723398   0.017658      ferrari   0.000000   0.000053  0.999844        clean   
1321337  0.049788      ferrari   0.000050   0.000200  0.998757        clean   
608244   0.113560      ferrari   0.000056   0.000452  0.993506        clean   
684579   0.036192      ferrari   0.000000   0.000144  0.999342        clean   
162605   0.117210      ferrari   0.000117   0.000350  0.993077        clean   
635380   0.070377      ferrari   0.000070   0.000211  0.997511        clean   
315525   0.042871      ferrari   0.000043   0.000171  0.999079        clean   
212262   0.017473      ferrari   0.000000   0.000053  0.999847      rebuilt   
1190071  0.077948      ferrari   0.000000   0.000312  0.996952        clean   
482666   0.245215      ferrari   0.000000   0.000978

In [21]:
y_dec['lincoln']

Id
1299110     1200
194087      9999
58003       7300
615420      1400
97647       1500
           ...  
817243     12136
1388591     9299
506679      9500
1375991    11500
1123451     4200
Name: price, Length: 3553, dtype: int64

In [22]:
# Split Train and Test
train_test_dec = {}
for i, X_new in X_dec.items():
    X_train, X_test, y_train, y_test = train_test_split(X_new,y_dec[i],test_size=0.2,random_state=200)
    train_test_dec[i] = [X_train, X_test, y_train, y_test]

In [23]:
train_test_dec

{'ferrari': [             year manufacturer  condition  cylinders  odometer title_status  \
  Id                                                                            
  583172   0.110840      ferrari   0.000000   0.000331  0.993812        clean   
  482666   0.245215      ferrari   0.000000   0.000978  0.969363        clean   
  638012   0.091760      ferrari   0.000093   0.000371  0.995764        clean   
  76395    0.083374      ferrari   0.000083   0.000249  0.996503        clean   
  212262   0.017473      ferrari   0.000000   0.000053  0.999847      rebuilt   
  614715   0.067463      ferrari   0.000067   0.000268  0.997713        clean   
  1500489  0.044264      ferrari   0.000045   0.000178  0.999015        clean   
  225673   0.069071      ferrari   0.000000   0.000207  0.997607        clean   
  669494   0.074264      ferrari   0.000074   0.000296  0.997229        clean   
  683466   0.036192      ferrari   0.000000   0.000144  0.999342        clean   
  496613   0.1296

In [24]:
# Linear Regression
lin_reg_dec = {}
for j, [X_train, X_test, y_train, y_test] in train_test_dec.items():
    lin_reg = LinearRegression()
    lin_reg_dec[j] = lin_reg.fit(X_train.drop(columns=['manufacturer','title_status','transmission','size']),y_train.drop(columns=['manufacturer','title_status','transmission','size']))

In [25]:
lin_reg_dec

{'ferrari': LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
 'kia': LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
 'harley-davidson': LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
 'morgan': LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
 'mazda': LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
 'mini': LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
 'volvo': LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
 'subaru': LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
 'buick': LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
 'cadillac': LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
 'land rover': LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None

In [26]:
# Predict
y_pred_dec = {}
for i, [X_train, X_test, y_train, y_test] in train_test_dec.items():
    y_pred_dec[i] = lin_reg_dec[i].predict(train_test_dec[i][1].drop(columns=['manufacturer','title_status','transmission','size']))

In [27]:
# Check error
error = {}
for i, [X_train, X_test, y_train, y_test] in train_test_dec.items():
    error[i] = mean_squared_error(y_test, y_pred_dec[i])

In [28]:
error

{'ferrari': 5948267708.054666,
 'kia': 70198781.32000004,
 'harley-davidson': 54105650.83417925,
 'morgan': 1062733517.3517891,
 'mazda': 35849196.9410954,
 'mini': 60336863.128620826,
 'volvo': 74288718.39847171,
 'subaru': 48395820.63363186,
 'buick': 52711328.36517765,
 'cadillac': 192008770.87808326,
 'land rover': 250539786.05716062,
 'bmw': 24108471638.012726,
 'dodge': 106911280825697.9,
 'chrysler': 36783728748.42921,
 'mitsubishi': 98840166111.41888,
 'ford': 92471142277262.02,
 'porche': 79040291.79787548,
 'ram': 10884794587.189919,
 'chevrolet': 161084961393.0348,
 'infiniti': 100085474.17920615,
 'nissan': 3474710599438.902,
 'lincoln': 2148958664.695621,
 'mercedes-benz': 720912157635184.9,
 'jeep': 193386289.94866368,
 'gmc': 108222222521.50026,
 'volkswagen': 100017452154.09772,
 'hyundai': 2096626489.9974241,
 'alfa-romeo': 103717087.24070063,
 'acura': 62939961.42808076,
 'aston-martin': 283454374.2847888,
 'pontiac': 38356339.030160934,
 'jaguar': 136616870.87737834,

In [29]:
X_sub_dec

{'ferrari':              year manufacturer  condition  cylinders  odometer title_status  \
 Id                                                                            
 873208   0.099008      ferrari   0.000000   0.000396  0.995075      salvage   
 1615923  0.017548      ferrari   0.000000   0.000053  0.999846        clean   
 1245573  0.062069      ferrari   0.000062   0.000373  0.998067        clean   
 657050   0.017443      ferrari   0.000000   0.000053  0.999847        clean   
 682622   0.116723      ferrari   0.000117   0.000467  0.993139        clean   
 1676954  0.997843      ferrari   0.000000   0.002976  0.049595        clean   
 576328   0.110840      ferrari   0.000000   0.000331  0.993812        clean   
 930632   0.099008      ferrari   0.000000   0.000396  0.995075      salvage   
 336544   0.152945      ferrari   0.000152   0.000608  0.988214      rebuilt   
 684874   0.074264      ferrari   0.000000   0.000296  0.997229        clean   
 622772   0.017522      ferra

In [30]:
# Try optimal Linear Regression for submission
y_sub = {}
for i, df in X_sub_dec.items():
    x = df.drop(columns=['manufacturer','title_status','transmission','size'])
    y_sub[i] = lin_reg_dec[i].predict(x)

In [31]:
X_sub_dec

{'ferrari':              year manufacturer  condition  cylinders  odometer title_status  \
 Id                                                                            
 873208   0.099008      ferrari   0.000000   0.000396  0.995075      salvage   
 1615923  0.017548      ferrari   0.000000   0.000053  0.999846        clean   
 1245573  0.062069      ferrari   0.000062   0.000373  0.998067        clean   
 657050   0.017443      ferrari   0.000000   0.000053  0.999847        clean   
 682622   0.116723      ferrari   0.000117   0.000467  0.993139        clean   
 1676954  0.997843      ferrari   0.000000   0.002976  0.049595        clean   
 576328   0.110840      ferrari   0.000000   0.000331  0.993812        clean   
 930632   0.099008      ferrari   0.000000   0.000396  0.995075      salvage   
 336544   0.152945      ferrari   0.000152   0.000608  0.988214      rebuilt   
 684874   0.074264      ferrari   0.000000   0.000296  0.997229        clean   
 622772   0.017522      ferra

In [32]:
for i, df in X_sub_dec.items():
    df['price'] = abs(y_sub[i])

In [33]:
list(X_sub_dec.values())

[             year manufacturer  condition  cylinders  odometer title_status  \
 Id                                                                            
 873208   0.099008      ferrari   0.000000   0.000396  0.995075      salvage   
 1615923  0.017548      ferrari   0.000000   0.000053  0.999846        clean   
 1245573  0.062069      ferrari   0.000062   0.000373  0.998067        clean   
 657050   0.017443      ferrari   0.000000   0.000053  0.999847        clean   
 682622   0.116723      ferrari   0.000117   0.000467  0.993139        clean   
 1676954  0.997843      ferrari   0.000000   0.002976  0.049595        clean   
 576328   0.110840      ferrari   0.000000   0.000331  0.993812        clean   
 930632   0.099008      ferrari   0.000000   0.000396  0.995075      salvage   
 336544   0.152945      ferrari   0.000152   0.000608  0.988214      rebuilt   
 684874   0.074264      ferrari   0.000000   0.000296  0.997229        clean   
 622772   0.017522      ferrari   0.0000

In [34]:
y_sub = pd.concat(list(X_sub_dec.values()),axis=0)['price']

In [35]:
y_sub.shape

(253073,)

In [36]:
y_sub.to_csv('../cars-competition/data/lin_reg_by_manufacturer.csv', header=True, index=True)

In [99]:
# Prepare for submission
sub30 = submission
sub30['price'] = y_sub.astype('int')
sub30 = sub30['price']
sub30.to_csv('../cars-competition/data/lin_reg_by_manufacturer.csv', header=True, index=True)

In [None]:
y_sub