In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler
import statsmodels.formula.api as sm
from sklearn.linear_model import LinearRegression
from sklearn import model_selection as cv

# 1. Data Preprocessing

- Load dataset

In [2]:
df = pd.read_csv("../data/autos.csv", encoding='latin-1')

In [3]:
del df["nrOfPictures"],df["lastSeen"],df["dateCreated"],df["name"],df["dateCrawled"], df["postalCode"], df["model"], df["monthOfRegistration"]

In [4]:
#Shifting price column to last column in df
price = df["price"].copy()
del df["price"]
df["price"] = price

- Removing null entries

In [5]:
df.isnull().sum()

seller                    0
offerType                 0
abtest                    0
vehicleType           37869
yearOfRegistration        0
gearbox               20209
powerPS                   0
kilometer                 0
fuelType              33386
brand                     0
notRepairedDamage     72060
price                     0
dtype: int64

In [6]:
df.dropna(inplace = True)

In [7]:
df

Unnamed: 0,seller,offerType,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,kilometer,fuelType,brand,notRepairedDamage,price
1,privat,Angebot,test,coupe,2011,manuell,190,125000,diesel,audi,ja,18300
3,privat,Angebot,test,kleinwagen,2001,manuell,75,150000,benzin,volkswagen,nein,1500
4,privat,Angebot,test,kleinwagen,2008,manuell,69,90000,diesel,skoda,nein,3600
5,privat,Angebot,test,limousine,1995,manuell,102,150000,benzin,bmw,ja,650
6,privat,Angebot,test,cabrio,2004,manuell,109,150000,benzin,peugeot,nein,2200
7,privat,Angebot,test,limousine,1980,manuell,50,40000,benzin,volkswagen,nein,0
10,privat,Angebot,control,limousine,2004,manuell,105,150000,benzin,mazda,nein,2000
11,privat,Angebot,control,kombi,2005,manuell,140,150000,diesel,volkswagen,ja,2799
14,privat,Angebot,control,suv,2011,manuell,190,70000,diesel,nissan,nein,17999
17,privat,Angebot,control,kleinwagen,2004,automatik,75,150000,benzin,renault,nein,1750


- Set X,Y variable using dataframe

In [8]:
X = df.iloc[:,:-1].values
Y = df.iloc[:,-1].values

- Label Encoding to change strings to integers

In [9]:
labelEncoder_Seller = LabelEncoder()
X[:,0] = labelEncoder_Seller.fit_transform(X[:,0])
print(len(labelEncoder_Seller.classes_))

labelEncoder_OfferType = LabelEncoder()
X[:,1] = labelEncoder_OfferType.fit_transform(X[:,1])
print(len(labelEncoder_OfferType.classes_))

labelEncoder_AbTest = LabelEncoder()
X[:,2] = labelEncoder_AbTest.fit_transform(X[:,2])
print(len(labelEncoder_AbTest.classes_))

labelEncoder_VehicleType = LabelEncoder()
X[:,3] = labelEncoder_VehicleType.fit_transform(X[:,3])
print(len(labelEncoder_VehicleType.classes_))

labelEncoder_Gearbox = LabelEncoder()
X[:,5] = labelEncoder_Gearbox.fit_transform(X[:,5])
print(len(labelEncoder_Gearbox.classes_))

labelEncoder_FuelType = LabelEncoder()
X[:,8] = labelEncoder_FuelType.fit_transform(X[:,8])
print(len(labelEncoder_FuelType.classes_))

labelEncoder_Brand = LabelEncoder()
X[:,9] = labelEncoder_Brand.fit_transform(X[:,9])
print(len(labelEncoder_Brand.classes_))

labelEncoder_NotRepairedDamage = LabelEncoder()
X[:,10] = labelEncoder_NotRepairedDamage.fit_transform(X[:,10])
print(len(labelEncoder_NotRepairedDamage.classes_))

2
2
2
8
2
7
40
2


- One Hot Encoding to change categorical integers to dummy variables

In [10]:
oneHotEncoder = OneHotEncoder(categorical_features = [0,1,2,3,5,8,9,10])
X = oneHotEncoder.fit_transform(X).toarray()
#This will add the dummy features in order of passed array above and will place uncategorical features to last in same order.

- Avoiding Dummy Variable Trap

In [11]:
#Taking one less dummy variable from each set of categorical dummy variables
removed_columns =  set([1,3,5,13,15,22,62,64])
all_columns = set(range(0,68))
X = X[:,list(all_columns - removed_columns)]

- Using Standard Scalar to scale features

In [12]:
sc_X = StandardScaler()
X = sc_X.fit_transform(X)

# 2. Without Backward Elmination

In [13]:
X_train, X_test, Y_train, Y_test = cv.train_test_split(X,Y,test_size=0.25,random_state=0)
regressor = LinearRegression()
regressor.fit(X_train,Y_train)
regressor.score(X_test,Y_test)

3.2429986390081211e-05

## 3. Using Backward Elimination

#### Step 1 (Selecting Significance Level)

In [14]:
#Adding column of 1s for constant
X = np.append(arr = np.ones((X.shape[0],1)).astype(int),values = X,axis=1)
significance_level = 0.05

#### Step 2 (Include all features)

In [15]:
included_columns = [i for i in range(X.shape[-1])]

#### Step 3 and Step 4 (Fit Model with features available in included_columns and find p values for all untill all p values are less than significance level)

In [16]:
finished = False
while not finished:
    X_opt = X[:,included_columns]
    regressor_OLS = sm.OLS(endog=Y,exog=X_opt).fit()
    #print(regressor_OLS.summary())
    table_data = regressor_OLS.summary().tables[1].data
    p_values = [float(table_data[i][4]) for i in range(1,len(table_data))]
    if(max(p_values) > significance_level): 
        column_to_remove = p_values.index(max(p_values))
        print("Removing column",included_columns[column_to_remove])
        del included_columns[column_to_remove]
    else:
        finished = True
print("Included columns are ",included_columns)

Removing column 2
Removing column 23
Removing column 21
Removing column 46
Removing column 40
Removing column 48
Removing column 1
Removing column 39
Removing column 43
Removing column 41
Removing column 14
Removing column 18
Removing column 52
Removing column 35
Removing column 37
Removing column 22
Removing column 42
Removing column 31
Removing column 53
Removing column 17
Removing column 30
Removing column 20
Removing column 24
Removing column 34
Removing column 33
Removing column 29
Removing column 49
Removing column 54
Removing column 56
Removing column 27
Removing column 45
Removing column 26
Removing column 47
Removing column 28
Removing column 25
Removing column 36
Removing column 32
Removing column 7
Removing column 6
Removing column 9
Removing column 5
Removing column 10
Removing column 44
Removing column 15
Removing column 19
Removing column 38
Removing column 13
Removing column 16
Removing column 50
Removing column 8
Removing column 3
Removing column 58
Removing column 60
R

#### Step 5 (Use available features to build the model

In [17]:
#Modifying X according to results of backward elimination
X = X[:,included_columns]
X_train, X_test, Y_train, Y_test = cv.train_test_split(X,Y,test_size=0.25,random_state=0)
regressor = LinearRegression()
regressor.fit(X_train,Y_train)
regressor.score(X_test,Y_test) #Score with all-in model

4.9017819302554777e-05