In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from clean_data import clean_data, prepare_data
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import normalize

In [2]:
# Load the training and submission tables, each indexed by its 'Id' column
train, submission = (
    pd.read_csv(csv_path, index_col='Id')
    for csv_path in (
        '../cars-competition/data/cars_train.csv',
        '../cars-competition/data/cars_test.csv',
    )
)

In [3]:
# Pass both tables through clean_data (training table first, then submission)
train, submission = [clean_data(table) for table in (train, submission)]

Cleaning cylinders...
Cleaning condition...
Cleaning odometer...
Cleaning drive...
Cleaning size...
Cleaning manufacturer...
Cleaning fuel...
Cleaning transmission...
Cleaning title status...
Data cleaning complete!
Cleaning cylinders...
Cleaning condition...
Cleaning odometer...
Cleaning drive...
Cleaning size...
Cleaning manufacturer...
Cleaning fuel...
Cleaning transmission...
Cleaning title status...
Data cleaning complete!


In [4]:
# Build the model inputs from the selected feature columns.
# Alternative feature set, kept for reference:
# ['year','manufacturer','condition','cylinders','fuel','odometer','title_status','transmission','drive','size']
columns = [
    'year', 'manufacturer', 'condition', 'cylinders', 'odometer',
    'title_status', 'transmission', 'size', 'lat', 'long',
]
# prepare_data returns a pair; for the test table the second item is discarded
X, y = prepare_data(train, columns, typ='train')
X_sub, _ = prepare_data(submission, columns, typ='test')

Preparing data...
Data preparation complete!
Preparing data...
Data preparation complete!


In [5]:
# Split data by year
def yearSplit(X, y=None, start=1900, n_decades=12):
    """Partition rows into consecutive ten-year buckets based on ``X['year']``.

    Parameters
    ----------
    X : DataFrame with a ``'year'`` column.
    y : optional target (Series, DataFrame or array) row-aligned with ``X``.
        ``None`` or an empty container means "no target".
    start : first year of bucket 0 (default 1900).
    n_decades : number of ten-year buckets to create (default 12, i.e. up to
        ``start + 119``).

    Returns
    -------
    (decade, y_decade) : two dicts keyed by bucket number ``0..n_decades-1``.
        ``decade[i]`` holds the rows of ``X`` whose year lies in bucket ``i``;
        ``y_decade[i]`` holds the matching rows of ``y`` and stays empty when
        no target was supplied.
    """
    # Decide once whether a target was supplied. Testing `y` element-wise
    # (as `any(y != None)` did) iterates column labels for a DataFrame and
    # therefore depends on how the columns happen to be named.
    has_target = y is not None and len(y) > 0

    decade = {}
    y_decade = {}
    n_assigned = 0
    for i in range(n_decades):
        bounds = (i*10 + start, i*10 + start + 9)
        print('Spliting data by decade: {}'.format(bounds))
        dec = (X['year'] >= bounds[0]) & (X['year'] <= bounds[1])
        decade[i] = X[dec]
        if has_target:
            y_decade[i] = y[dec]
        # Buckets are disjoint, so the per-bucket counts can simply be summed
        n_assigned += int(dec.sum())

    # Rows with a missing year or a year outside the covered range end up in no
    # bucket; say so instead of losing them silently.
    n_left_out = len(X) - n_assigned
    if n_left_out:
        print('{} row(s) have a missing year or a year outside {}-{} and were not assigned to any decade'.format(
            n_left_out, start, start + 10*n_decades - 1))
    return decade, y_decade

In [6]:
# Bucket the training features/target and the submission features by decade
print('Spliting Train')
X_dec, y_dec = yearSplit(X, y=y)
print('Spliting Test')
# No target exists for the submission rows, so the second result is discarded
X_sub_dec, _ = yearSplit(X=X_sub)

Spliting Train
Spliting data by decade: (1900, 1909)
Spliting data by decade: (1910, 1919)
Spliting data by decade: (1920, 1929)
Spliting data by decade: (1930, 1939)
Spliting data by decade: (1940, 1949)
Spliting data by decade: (1950, 1959)
Spliting data by decade: (1960, 1969)
Spliting data by decade: (1970, 1979)
Spliting data by decade: (1980, 1989)
Spliting data by decade: (1990, 1999)
Spliting data by decade: (2000, 2009)
Spliting data by decade: (2010, 2019)
Spliting Test
Spliting data by decade: (1900, 1909)
Spliting data by decade: (1910, 1919)
Spliting data by decade: (1920, 1929)
Spliting data by decade: (1930, 1939)
Spliting data by decade: (1940, 1949)
Spliting data by decade: (1950, 1959)
Spliting data by decade: (1960, 1969)
Spliting data by decade: (1970, 1979)
Spliting data by decade: (1980, 1989)
Spliting data by decade: (1990, 1999)
Spliting data by decade: (2000, 2009)
Spliting data by decade: (2010, 2019)


In [7]:
# Normalize columns
# NOTE(review): sklearn's `normalize` rescales each ROW (sample) to unit norm by
# default, not each column. That behaviour is kept here unchanged; confirm whether
# per-column scaling was actually intended.
norm = ['year','condition','cylinders','odometer','lat','long']
for frames in (X_dec, X_sub_dec):
    # The loop variable is not called `X` / `X_sub`, so the full feature tables
    # built earlier are not overwritten by the last decade's frame.
    for i, frame in frames.items():
        if frame.empty:
            # `normalize` rejects inputs with zero samples; nothing to scale anyway
            continue
        # Work on an explicit copy: the per-decade frames come from boolean
        # filtering, and assigning into such a slice is what triggers pandas'
        # SettingWithCopyWarning.
        frame = frame.copy()
        frame[norm] = normalize(frame[norm])
        frames[i] = frame

In [8]:
# Feature reduction PCA
# For every decade, fit one PCA per candidate size and keep both the fitted
# transformer (needed later for the submission rows) and the projected data.
n_components = [5,10,15,20,25,30,35,40]
# For a quicker run use: n_components = [5,10,15]
X_new_dec = {}
pca = {}
for i, frame in X_dec.items():
    new = {}
    pca_n = {}
    for n in n_components:
        # Fixed seed so that a randomized SVD solver, if selected, is repeatable
        reducer = PCA(n_components=n, random_state=200)
        try:
            reduced = reducer.fit_transform(frame)
        except ValueError as exc:
            # Typically n exceeds what this decade's data supports. Skip the size
            # entirely: storing the previous projection under this key (as a bare
            # `except: pass` did) would pair an unfitted PCA with data from
            # another size or even another decade.
            print('Skipping PCA with {} components for decade {}: {}'.format(n, i, exc))
            continue
        pca_n[n] = reducer
        new[n] = reduced
    X_new_dec[i] = new
    pca[i] = pca_n

In [9]:
# Hold out 20% of every (decade, PCA size) projection for evaluation.
# Each entry is the list [X_train, X_test, y_train, y_test]; the seed is fixed at 200.
train_test_dec = {
    i: {
        j: list(train_test_split(projected, y_dec[i], test_size=0.2, random_state=200))
        for j, projected in by_size.items()
    }
    for i, by_size in X_new_dec.items()
}

In [10]:
# GradientBoostingRegressor
# Fit one regressor per (decade, PCA size) pair on that pair's training split.
# random_state is fixed so that repeated runs build the same trees and the
# model-selection step downstream is reproducible.
gradb_reg_dec = {}
for i, dic in train_test_dec.items():
    gradb_reg_n = {}
    for j, [X_train, X_test, y_train, y_test] in dic.items():
        gradb_reg = GradientBoostingRegressor(random_state=200)
        # fit() returns the estimator itself, so the stored value is the fitted model
        gradb_reg_n[j] = gradb_reg.fit(X_train, y_train)
    gradb_reg_dec[i] = gradb_reg_n

KeyboardInterrupt: 

In [None]:
# Score every fitted model on its own held-out features
# (position 1 of each stored split is X_test)
y_pred_dec = {
    i: {
        j: gradb_reg_dec[i][j].predict(split[1])
        for j, split in by_size.items()
    }
    for i, by_size in train_test_dec.items()
}

In [None]:
# Mean squared error of each model's hold-out predictions
# (position 3 of each stored split is y_test)
error = {
    i: {
        j: mean_squared_error(split[3], y_pred_dec[i][j])
        for j, split in by_size.items()
    }
    for i, by_size in train_test_dec.items()
}

In [111]:
# For each decade keep the (n_components, mse) pair with the smallest error
pca_n = {}
for i, dic in error.items():
    ranked = sorted(dic.items(), key=lambda pair: pair[1])
    pca_n[i] = ranked[0]
pca_n

{0: (5, 26691299144594.914),
 1: (25, 107872684.50958434),
 2: (5, 100821548317269.16),
 3: (5, 92140489772.70012),
 4: (30, 5665863916911744.0),
 5: (35, 2420419685034184.5),
 6: (20, 830917323014.7333),
 7: (10, 1468879537764030.5),
 8: (5, 495736826888361.06),
 9: (5, 122169205572.4949),
 10: (15, 103281285744554.27),
 11: (10, 84325841380436.4)}

In [95]:
# Predict the submission rows with each decade's best gradient-boosting model
y_sub = {}
for i, df in X_sub_dec.items():
    if len(df) == 0:
        # Nothing to predict for this decade; keep an empty entry so every
        # decade still has a slot and PCA.transform is not called on zero rows.
        y_sub[i] = np.empty(0)
        continue
    best = pca_n[i]
    # The selection cell stores (n_components, mse) pairs, but the models and PCA
    # objects are keyed by n_components alone; a bare number is accepted as well.
    n_best = best[0] if isinstance(best, tuple) else best
    x = pca[i][n_best].transform(df)
    y_sub[i] = gradb_reg_dec[i][n_best].predict(x)

In [98]:
# Replace every prediction by its absolute value so no price is negative.
# The loop variable is deliberately not named `y`: that would overwrite the
# training target of the same name and break re-running earlier cells.
y_sub = {i: abs(pred) for i, pred in y_sub.items()}

In [103]:
# Flatten the per-decade predictions into one array, in the dict's iteration order.
# y_sub changes from dict to array here, so the guard keeps a second run of this
# cell from failing on the already-flattened array.
if isinstance(y_sub, dict):
    y_sub = np.concatenate(list(y_sub.values()))

In [104]:
# Display the flattened prediction array (numpy abbreviates long arrays)
y_sub

array([6245750.06808099, 6236620.94509115, 1645271.99390413, ...,
        135706.61069037,  135713.25590025,   14734.16099133])

In [107]:
# Prepare for submission
# y_sub is ordered decade by decade (the iteration order of X_sub_dec), whereas
# `submission` is in its original row order. Assigning the array positionally
# would attach prices to the wrong rows, so every prediction is re-attached to
# the row label it was computed from and then put back in submission order.
# NOTE(review): assumes prepare_data keeps the 'Id' row labels of `submission`;
# the checks below fail loudly if that does not hold.
row_labels = np.concatenate([frame.index.to_numpy() for frame in X_sub_dec.values()])
if len(row_labels) != len(y_sub):
    raise ValueError('Got {} predictions for {} decade-split rows'.format(len(y_sub), len(row_labels)))
if not submission.index.isin(row_labels).all():
    raise ValueError('Some submission rows have no prediction (row labels do not match the decade split)')

# NOTE(review): predictions are divided by 100 exactly as before (the output
# name says "hundreth") — confirm this scaling is intended.
# A fresh Series is built instead of adding a column to `submission`, so the
# input table is left unmodified.
sub30 = pd.Series(y_sub/100, index=row_labels, name='price').reindex(submission.index)
sub30.to_csv('../cars-competition/data/sub_mixPCA_byDecade_hundreth', header=True, index=True)