# DS Technical Mock Interview

In [1]:
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import tree
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

%matplotlib inline

Data is available here on Kaggle: https://www.kaggle.com/brentpafford/true-car-listings-2017-project?select=true_car_project_full.csv

In [2]:
# Load the TrueCar listings CSV (downloaded from the Kaggle link above) from the
# working directory, then preview the first five rows.
DATA_FILE = 'true_car_project_full.csv'
df = pd.read_csv(DATA_FILE)
df.head()

Unnamed: 0,Id,Price,Year,Mileage,City,State,City State,Vin,Make,Model,Region
0,1,16472,2015,18681,Jefferson City,MO,Jefferson City MO,KL4CJBSBXFB267643,Buick,EncoreConvenience,Midwest
1,2,15749,2015,27592,Highland,IN,Highland IN,KL4CJASB5FB245057,Buick,EncoreFWD,Midwest
2,3,16998,2015,13650,Boone,NC,Boone NC,KL4CJCSB0FB264921,Buick,EncoreLeather,Southeast
3,4,15777,2015,25195,New Orleans,LA,New Orleans LA,KL4CJASB4FB217542,Buick,EncoreFWD,Southeast
4,5,16784,2015,22800,Las Vegas,NV,Las Vegas NV,KL4CJBSB3FB166881,Buick,EncoreConvenience,Rockies


In [5]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(1048575, 11)

## Preprocessing

In [None]:
# Summarise each column's non-null count and dtype to spot missing values and
# columns that were read with an unexpected type.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 11 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   Id          1048575 non-null  int64 
 1   Price       1048575 non-null  int64 
 2   Year        1048575 non-null  int64 
 3   Mileage     1048575 non-null  int64 
 4   City        1048575 non-null  object
 5   State       1048575 non-null  object
 6   City State  1048575 non-null  object
 7   Vin         1048575 non-null  object
 8   Make        1048575 non-null  object
 9   Model       1048575 non-null  object
 10  Region      1048575 non-null  object
dtypes: int64(4), object(7)
memory usage: 88.0+ MB


Create a new column that concatenates the make and model with an underscore between them (e.g., `Buick_EncoreFWD`). Call this new column "Make_Model".

In [8]:
# Join Make and Model element-wise with an underscore separator,
# e.g. "Buick" + "EncoreFWD" -> "Buick_EncoreFWD".
df['Make_Model'] = df['Make'].str.cat(df['Model'], sep='_')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 12 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   Id          1048575 non-null  int64 
 1   Price       1048575 non-null  int64 
 2   Year        1048575 non-null  int64 
 3   Mileage     1048575 non-null  int64 
 4   City        1048575 non-null  object
 5   State       1048575 non-null  object
 6   City State  1048575 non-null  object
 7   Vin         1048575 non-null  object
 8   Make        1048575 non-null  object
 9   Model       1048575 non-null  object
 10  Region      1048575 non-null  object
 11  Make_Model  1048575 non-null  object
dtypes: int64(4), object(8)
memory usage: 96.0+ MB


Drop the City, State, City State, Vin, Make, and Model columns.

In [9]:
# Columns that are no longer needed: the location and VIN fields, plus Make and
# Model, which are now captured by Make_Model.
drop = ['City', 'State', 'City State', 'Vin', 'Make', 'Model']

# `columns=` names the axis explicitly instead of relying on axis=1.
df = df.drop(columns=drop)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 6 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   Id          1048575 non-null  int64 
 1   Price       1048575 non-null  int64 
 2   Year        1048575 non-null  int64 
 3   Mileage     1048575 non-null  int64 
 4   Region      1048575 non-null  object
 5   Make_Model  1048575 non-null  object
dtypes: int64(4), object(2)
memory usage: 48.0+ MB


In [10]:
# Check for NAs: count the missing values in each column
missing_per_column = df.isna().sum()
missing_per_column

Id            0
Price         0
Year          0
Mileage       0
Region        0
Make_Model    0
dtype: int64

In [11]:
# Check for duplicates: flag every row that exactly repeats an earlier row,
# then show the flagged rows (an empty frame means there are none).
is_repeat = df.duplicated()
duplicate = df.loc[is_repeat]
duplicate

Unnamed: 0,Id,Price,Year,Mileage,Region,Make_Model


## ML

Here we will make a small linear regression model to predict the price of a used car. There are a lot of different Make_Model combinations in the data, so we will be using only the 30 most frequent of these. 

In [12]:
# The 30 most frequent Make_Model values as a plain list, most common first.
# value_counts() sorts by descending frequency, so head(30) is the top 30.
top_30 = df["Make_Model"].value_counts().head(30).index.tolist()

Here we will create a new version of the dataframe that keeps only the rows where Make_Model is in top_30.

In [16]:
# Keep only the listings whose Make_Model is one of the 30 most frequent.
in_top_30 = df["Make_Model"].isin(top_30)
df = df[in_top_30]

# Re-check the schema and the numeric summary of the reduced frame.
df.info()
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
Index: 283292 entries, 681 to 1048570
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Id          283292 non-null  int64 
 1   Price       283292 non-null  int64 
 2   Year        283292 non-null  int64 
 3   Mileage     283292 non-null  int64 
 4   Region      283292 non-null  object
 5   Make_Model  283292 non-null  object
dtypes: int64(4), object(2)
memory usage: 15.1+ MB
                 Id          Price           Year       Mileage
count  2.832920e+05  283292.000000  283292.000000  2.832920e+05
mean   5.059333e+05   22684.978764    2013.476519  5.353672e+04
std    3.080870e+05   11212.785643       3.295757  4.309149e+04
min    6.820000e+02    1500.000000    1997.000000  5.000000e+00
25%    2.072075e+05   14988.000000    2012.000000  2.495975e+04
50%    5.289455e+05   19800.000000    2014.000000  4.072600e+04
75%    7.506432e+05   28998.000000    2016.000000  7.187300e+04
m

Get dummy variables for the categorical variables.

In [18]:
# Dummy (one-hot) encode the two categorical columns. drop_first=True omits the
# first level of each variable so the dummies are not perfectly collinear with
# the intercept of the linear model fitted later.
categorical_cols = ['Region', 'Make_Model']
df_binary_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
df_binary_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Index: 283292 entries, 681 to 1048570
Data columns (total 38 columns):
 #   Column                            Non-Null Count   Dtype
---  ------                            --------------   -----
 0   Id                                283292 non-null  int64
 1   Price                             283292 non-null  int64
 2   Year                              283292 non-null  int64
 3   Mileage                           283292 non-null  int64
 4   Region_Northeast                  283292 non-null  bool 
 5   Region_Pacific                    283292 non-null  bool 
 6   Region_Rockies                    283292 non-null  bool 
 7   Region_Southeast                  283292 non-null  bool 
 8   Region_Southwest                  283292 non-null  bool 
 9   Make_Model_BMW_5                  283292 non-null  bool 
 10  Make_Model_Chevrolet_CamaroCoupe  283292 non-null  bool 
 11  Make_Model_Chevrolet_EquinoxAWD   283292 non-null  bool 
 12  Make_Model_Chevrol

Please explain what the following code chunk does. 

In [19]:
# Create the scaler and fit it: learn each column's mean and standard deviation
# (fit() returns the fitted scaler itself).
data_scaler = StandardScaler().fit(df_binary_encoded)
# Standardise every column to zero mean and unit variance using those statistics.
# The result is a NumPy array, not a DataFrame.
# NOTE(review): df_data_scaled is not referenced by the later cells shown here,
# and the scaler is fitted on the full frame (Id and Price included).
df_data_scaled = data_scaler.transform(df_binary_encoded)

In [20]:
# Target is the listing price; features are every remaining column except the
# row identifier. Selecting by label avoids depending on column positions.
target = df_binary_encoded['Price']
features = df_binary_encoded.drop(columns=['Id', 'Price'])

Do a train/test split, with 20% test data.

In [24]:
# Train/test split: hold out 20% of the rows, with a fixed seed for reproducibility.
train, test = train_test_split(df_binary_encoded, test_size=0.2, random_state=777)

# Reuse the feature/target column selection defined in the previous cell.
feature_cols = features.columns
target_col = target.name

train_features, train_target = train[feature_cols], train[target_col]
test_features, test_target = test[feature_cols], test[target_col]

# Sanity-check the shapes of the four pieces.
for part in (train_features, train_target, test_features, test_target):
    print(part.shape)

(226633, 36)
(226633,)
(56659, 36)
(56659,)


Train a linear regression model with the data and get predictions.

In [25]:
# Instantiate the linear regression model and fit it on the training split
# (fit() returns the fitted estimator).
model = LinearRegression().fit(train_features, train_target)

# Get predictions for the held-out test rows
test_pred = model.predict(test_features)

In [26]:
# Mean squared error on the test split. scikit-learn metrics take
# (y_true, y_pred) in that order; MSE happens to be symmetric, but keeping the
# documented order avoids silent errors if the metric is swapped for an
# asymmetric one (e.g. r2_score).
mean_squared_error(test_target, test_pred)

34214886.71569756

In [27]:
# Root mean squared error: the square root puts the error back in the target's
# units (price), so it reads as a typical prediction error.
# Arguments follow scikit-learn's (y_true, y_pred) order; `math` is imported in
# the notebook's import cell.
math.sqrt(mean_squared_error(test_target, test_pred))

5849.349255746109

Please describe what MSE (mean squared error) is. Also, please interpret the square root of the MSE (the RMSE) computed above.