In [None]:
import pandas as pd
import numpy as np
import datetime
from datetime import date, timedelta
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
idx=pd.IndexSlice
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
plt.style.use('ggplot')

In [None]:
data = pd.read_csv('Car_Rates.csv')
data.head()

In [None]:
data.info()

In [None]:
data.isna().sum()

In [None]:
data.dropna(inplace=True)

In [None]:
data.duplicated().sum()

In [None]:
data.duplicated(subset='Model').sum()

In [None]:
data.duplicated(subset='Car_name').sum()

In [None]:
df = data.drop(['Year','Brand', 'Model'], axis=1)

In [None]:
def year(data, feature):
    """Return the first whitespace-separated token of every value in ``data[feature]``.

    The result is a plain list with one entry per row, in row order.
    """
    return [entry.split()[0] for entry in data[feature]]

In [None]:
df['Year']=year(df, 'Car_name')

In [None]:
def brand(data, feature):
    """Return the second whitespace-separated token of every value in ``data[feature]``.

    The result is a plain list with one entry per row, in row order.
    """
    return [entry.split()[1] for entry in data[feature]]

In [None]:
df['Brand']=brand(df, 'Car_name')

In [None]:
def model(data, feature):
    """Extract the model part of each car name in ``data[feature]``.

    Every name is split on whitespace. A three-token name yields its last
    token; a longer name yields its last two tokens joined by a space, with
    leading/trailing '.' characters stripped. Names with fewer than three
    tokens (or values that are not strings) yield ``None`` so the returned
    list always has exactly one entry per row and can be assigned back as a
    column.

    Returns:
        list: one entry per row of ``data[feature]``, in row order.
    """
    blanks = []
    # Split the column once instead of re-splitting it for every row.
    for tokens in data[feature].str.split():
        if not isinstance(tokens, list) or len(tokens) < 3:
            # No model part available; keep the row aligned with a placeholder.
            blanks.append(None)
        elif len(tokens) == 3:
            blanks.append(tokens[-1])
        else:
            blanks.append(' '.join(tokens[-2:]).strip('.'))

    return blanks

In [None]:
df['model'] = model(data, 'Car_name')

In [None]:
data = df

In [None]:
data.reset_index(drop=True, inplace=True)

In [None]:
# Brand Value Counts in Dataset
# Name both columns explicitly: the labels produced by
# value_counts().reset_index() differ between pandas 1.x and 2.x.
df = data['Brand'].value_counts().rename_axis('Brand').reset_index(name='count')
plt.figure(figsize=(16,8))
sns.barplot(data=df, x='Brand', y='count')
plt.xticks(rotation=45)
plt.title('Brand Value Counts in Dataset');


In [None]:
#Brand Analysis 
data.Brand.unique()

In [None]:
#Brand Avg General Rate of the vehicles on different year. 
df = data.groupby(['Brand', 'Year'])['General_rate'].mean().unstack().T
df

In [None]:
df.loc[:, ['BMW', 'Mercedes-Benz', 'Porsche']].plot(figsize=(16, 4))
plt.title('Avg Overall General Rating Over the Year of BMW, Benz, and Porsche')

In [None]:
#Review analysis

In [None]:
#Brands with the most reviews
df = data.groupby('Brand')['Num_of_reviews'].sum().sort_values(ascending=False).reset_index()
plt.figure(figsize=(16, 4))
sns.barplot(data=df, x='Brand', y='Num_of_reviews')
plt.xticks(rotation=45)
plt.title("Brand's Overall Number of Reviews");

In [None]:
data.head()

In [None]:
#Return a new dataframe with the avg value of each numeric column
def avg(data):
    """Average every non-object column except 'Num_of_reviews' per (Brand, Year).

    Returns:
        pandas.DataFrame: one row per (Brand, Year) group and one column per
        averaged feature. An empty frame is returned when no column qualifies.
    """
    group_means = [
        data.groupby(['Brand', 'Year'])[x].mean().sort_values(ascending=False)
        for x in data.columns
        if data[x].dtype != 'O' and x != 'Num_of_reviews'
    ]
    if not group_means:
        return pd.DataFrame()

    # DataFrame.append was removed in pandas 2.0. Concatenating the per-column
    # Series side by side aligns them on the shared (Brand, Year) index.
    return pd.concat(group_means, axis=1)

In [None]:
brand_avg = avg(data)
brand_avg

In [None]:
def year_avg(data):
    """Average every non-object column except 'Num_of_reviews' per Year.

    Returns:
        pandas.DataFrame: one row per averaged feature and one column per
        Year value (callers transpose it to get years as rows). An empty
        frame is returned when no column qualifies.
    """
    group_means = [
        data.groupby('Year')[x].mean().sort_values(ascending=False)
        for x in data.columns
        if data[x].dtype != 'O' and x != 'Num_of_reviews'
    ]
    if not group_means:
        return pd.DataFrame()

    # DataFrame.append was removed in pandas 2.0. Concatenate the Series as
    # columns, then transpose so each feature is a row as before.
    return pd.concat(group_means, axis=1).T

In [None]:
year_avg_df = year_avg(data).T.sort_index()

In [None]:
year_avg_df

In [None]:
year_avg_df.plot(figsize=(16,7))
plt.title('Trend of Each Rating Over the Years')

In [None]:
#Each Brand's Overall Avg Rating over the years

In [None]:
for x in data.Brand.unique():
    df = year_avg(data.loc[data['Brand']==x]).T.sort_index()
    df.plot(figsize=(16, 8))
    plt.title(f"{x}'s Overal Avg Rating Over the Years'")

In [None]:
# Car Brand Analysis

In [None]:
#BMW
BMW = data.loc[data['Brand'] == 'BMW']
BMW.reset_index(drop=True, inplace=True)

In [None]:
BMW.head()

In [None]:
def hist(data):
    """Draw a seaborn histogram for every non-object column of ``data``.

    Each histogram is displayed with ``plt.show()`` before the next is drawn.
    """
    plottable = (col for col in data.columns if data[col].dtype != 'O')
    for col in plottable:
        sns.histplot(data[col])
        plt.show()

In [None]:
hist(BMW)

In [None]:
BMW.model.unique()

In [None]:
#BMW Rating Over the years
df = year_avg(BMW).T.sort_index()
df

In [None]:
df.plot(figsize=(16, 7))
plt.title('BMW Overall Avg Rating Over the Years')

In [None]:
len(BMW.model.unique())

In [None]:
#BMW's TOP 10 model in Avg general rate
df=BMW.groupby('model')['General_rate'].mean().sort_values(ascending=False)[:10].reset_index()
df

In [None]:
#Avg overall rating for each model in each year
def overall_rating(data):
    """Average every non-object column of ``data`` per (model, Year).

    Returns:
        pandas.DataFrame: one row per (model, Year) group, sorted by index,
        and one column per averaged feature. An empty frame is returned when
        no column qualifies.
    """
    group_means = [
        data.groupby(['model', 'Year'])[x].mean().sort_values(ascending=False)
        for x in data.columns
        if data[x].dtype != 'O'
    ]
    if not group_means:
        return pd.DataFrame()

    # DataFrame.append was removed in pandas 2.0. Concatenating the Series as
    # columns aligns them on the shared (model, Year) index.
    return pd.concat(group_means, axis=1).sort_index()

BMW_overall_rating = overall_rating(BMW)
BMW_overall_rating

In [None]:
# Restrict to numeric columns: corr() no longer silently skips string columns.
BMW.select_dtypes('number').corr()['Comfort'].sort_values(ascending=False)

In [None]:
sns.pairplot(BMW)

In [None]:
# Restrict to numeric columns: corr() no longer silently skips string columns.
sns.heatmap(BMW.select_dtypes('number').corr(), annot=True)

In [None]:
# Benz Analysis 

In [None]:
Benz=data.loc[data['Brand']=='Mercedes-Benz'].reset_index(drop=True)
Benz.head()

In [None]:
hist(Benz)

In [None]:
#Benz's TOP 10 model in Avg general rate#
Benz.groupby('model')['General_rate'].mean().sort_values(ascending=False)[:10].reset_index()

In [None]:
# Avg Overall rating of each model of Benz Cars
overall_rating(Benz)

In [None]:
df = year_avg(Benz).T.sort_index()
df

In [None]:
df.plot(figsize=(16, 8))
plt.title("Benz Overall Avg Rating Over the Years")

In [None]:
# Restrict to numeric columns: corr() no longer silently skips string columns.
Benz.select_dtypes('number').corr()['Comfort'].sort_values(ascending=False)

In [None]:
sns.pairplot(Benz)

In [None]:
# Restrict to numeric columns: corr() no longer silently skips string columns.
sns.heatmap(Benz.select_dtypes('number').corr(), annot=True)

In [None]:
#Porsche Analysis

In [None]:
Porsche = data.loc[data['Brand']=='Porsche'].reset_index(drop=True)

In [None]:
# Histogram (with KDE) of every non-object Porsche column.
for x in Porsche.columns:
    if Porsche[x].dtype != 'O':
        # Plot the Porsche subset, not the full `data` frame.
        sns.histplot(Porsche[x], kde=True)
        plt.show()

In [None]:
df=year_avg(Porsche).T
df

In [None]:
df.plot(figsize=(16, 8))
plt.title('Porsche Overall Avg Rating Over the Years')

In [None]:
overall_rating(Porsche)

In [None]:
# Restrict to numeric columns: corr() no longer silently skips string columns.
Porsche.select_dtypes('number').corr()['Comfort'].sort_values(ascending=False)

In [None]:
sns.pairplot(Porsche)

In [None]:
#Prediction of General_rate

In [None]:
from sklearn.model_selection import train_test_split

X = data.drop(['Car_name', 'Brand', 'model', 'General_rate'], axis=1)
X['Year'] = X.loc[:, 'Year'].astype(float)

y = data.loc[:, 'General_rate']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost

In [None]:
#DecisionTreeRegressor

In [None]:
tree = DecisionTreeRegressor()
tree.fit(X_train, y_train)

In [None]:
y_hat_train_tree = tree.predict(X_train)

In [None]:
# Training-set MSE of the decision tree (y_true first, then predictions).
mean_squared_error(y_train, y_hat_train_tree)

In [None]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree, X_train, y_train,
                        scoring='neg_mean_squared_error', cv=10)

tree_rmse_in_sample_estimates = np.sqrt(-scores)

tree_rmse_in_sample_estimates

In [None]:
def display_scores(scores):
    """Print the scores, their mean and their standard deviation.

    Every value is printed on its own line, framed by '=' separator lines.
    Always returns None.
    """
    separator = '==============================================='
    # Each value is computed lazily, right before its line is printed.
    report = (
        ('Scores: {}', lambda: scores),
        ('Mean Score: {}', lambda: scores.mean()),
        ('Standard Deviation of Scores: {}', lambda: scores.std()),
    )

    print(separator)
    for template, compute in report:
        print(template.format(compute()))
        print(separator)

    return None

In [None]:
display_scores(tree_rmse_in_sample_estimates)

In [None]:
#RandomForestRegressor

In [None]:
random = RandomForestRegressor()
random.fit(X_train, y_train)

In [None]:
y_hat_train_random = random.predict(X_train)

In [None]:
mean_squared_error(y_hat_train_random, y_train)

In [None]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(random, X_train, y_train,
                        scoring='neg_mean_squared_error', cv=10)

rand_rmse_in_sample_estimates = np.sqrt(-scores)

rand_rmse_in_sample_estimates

In [None]:
display_scores(rand_rmse_in_sample_estimates)

In [None]:
#Parameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
random.get_params()

In [None]:
# Hyper-parameter grid explored by the grid search over the random forest.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    # 1.0 means "consider every feature", which is what the 'auto' alias meant
    # for RandomForestRegressor before scikit-learn removed that alias.
    'max_features': [1.0, 'log2'],
}

In [None]:
grid_search = GridSearchCV(random,
                          param_grid,
                          cv=5,
                          scoring='neg_mean_squared_error',
                          return_train_score=True,
                          refit=True)

In [None]:
grid_search.fit(X_train, y_train)

In [None]:
grid_search.best_estimator_

In [None]:
feature_importances = grid_search.best_estimator_.feature_importances_

In [None]:
plt.barh(X_train.columns, feature_importances)

In [None]:
sorted(zip(X_train.columns, feature_importances))

In [None]:
final_model = grid_search.best_estimator_

In [None]:
# Fit on the training split only. Fitting on (X_test, y_test) and then scoring
# on the same rows leaks the test set and makes the test MSE meaningless.
final_model.fit(X_train, y_train)

In [None]:
y_hat_test_final = final_model.predict(X_test)

In [None]:
mean_squared_error(y_test, y_hat_test_final)

In [None]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(final_model, X_train, y_train,
                        scoring='neg_mean_squared_error', cv=10)

final_model_rmse_in_sample_estimates = np.sqrt(-scores)

final_model_rmse_in_sample_estimates

In [None]:
display_scores(final_model_rmse_in_sample_estimates)

In [None]:
data2 = pd.read_csv('New_York_cars.csv').drop(['currency', 'brand', 'Year'], axis=1)

In [None]:
data2['Year'] = year(data2, 'name')

In [None]:
data2['Brand'] = brand(data2, 'name')

In [None]:
data2.info()

In [None]:
data2.isna().sum()

In [None]:
data2.drop_duplicates(inplace=True)

In [None]:
data2.reset_index(drop=True, inplace=True)

In [None]:
#Mileage Percentage of used cars in the data / Simple Mileage analysis (used cars) 

In [None]:
def mileage(data, feature):
    """Label every value of ``data[feature]`` with its mileage band.

    Bands are right-inclusive: '0 - 50k' covers values up to and including
    50,000, '50k - 100k' covers values above 50,000 up to 100,000, and so on
    up to '600k - 700k'. Any value that matches no band is labelled '700k+';
    that includes NaN, which compares False against every bound.

    Returns:
        list[str]: one label per row of ``data[feature]``, in row order.
    """
    # (inclusive upper bound, label), in ascending order of the bound.
    bands = (
        (50000, '0 - 50k'),
        (100000, '50k - 100k'),
        (200000, '100k - 200k'),
        (300000, '200k - 300k'),
        (400000, '300k - 400k'),
        (500000, '400k - 500k'),
        (600000, '500k - 600k'),
        (700000, '600k - 700k'),
    )

    blanks = []
    for x in data[feature]:
        for upper, label in bands:
            if x <= upper:
                blanks.append(label)
                break
        else:
            blanks.append('700k+')
    return blanks

In [None]:
# Take an explicit copy: columns are added to used_cars later, and assigning
# into a slice of data2 is chained assignment.
used_cars = data2.loc[data2['new&used']=='Used'].copy()

In [None]:
used_cars['mileage distribution'] = mileage(used_cars, 'Mileage')

In [None]:
# Name both columns explicitly: the labels produced by
# value_counts().reset_index() differ between pandas 1.x and 2.x.
df = (
    used_cars['mileage distribution']
    .value_counts()
    .rename_axis('mileage distribution')
    .reset_index(name='count')
)

fig, ax = plt.subplots()
ax.pie(
    x=df['count'],
    startangle=90,
    labels=df['mileage distribution'],
    autopct="%.0f%%",
    shadow=True,
)

plt.title('Mileage Distribution among Used Cars', fontsize=12);

In [None]:
sns.histplot(used_cars['Mileage'], kde=True)

In [None]:
#Mileage Histogram for each Brand

for x in used_cars.Brand.unique():
    df = used_cars.loc[used_cars['Brand'] == x]
    sns.histplot(df['Mileage'])
    plt.title(f'{x}')
    plt.show()
    
#Mileage & Price for trend for brand please see below

In [None]:
#New & Used Car Distribution among each Brand 

In [None]:
# One pie chart per brand showing the share of each 'new&used' category.
for x in data2.Brand.unique():

    # Name both columns explicitly: the labels produced by
    # value_counts().reset_index() differ between pandas 1.x and 2.x.
    df = (
        data2.loc[data2['Brand'] == x, 'new&used']
        .value_counts()
        .rename_axis('new&used')
        .reset_index(name='count')
    )

    fig, ax = plt.subplots()
    ax.pie(
        x=df['count'],
        startangle=90,
        labels=df['new&used'],
        autopct="%.0f%%",
        shadow=True,
    )

    plt.title(f'New & Used Car Distribution for {x}', fontsize=12);

In [None]:
#Brand Price analysis

In [None]:
#Histogram of each brand's price
# Iterate the brands of data2 (the frame being plotted), not those of the
# ratings frame `data`, whose brand set may differ.
for x in data2.Brand.unique():
    sns.histplot(data2.loc[data2['Brand'] == x, 'money'])
    plt.title(f"{x}")
    plt.show()

In [None]:
df=data2.groupby('Brand')['money'].mean().sort_values(ascending=False).reset_index()
plt.figure(figsize=(16, 8))
sns.barplot(data=df, x='Brand', y='money')
plt.xticks(rotation=45)
plt.title('Average Price for each Brand');

In [None]:
df=data2.groupby('Brand')['money'].max().sort_values(ascending=False).reset_index()
plt.figure(figsize=(16, 8))
sns.barplot(data=df, x='Brand', y='money')
plt.xticks(rotation=45)
plt.title('Most Expensive Car Pirce for each Brand');

In [None]:
#Trend of each brand's avg price over the years
df = data2.groupby(['Brand', 'Year'])['money'].mean().unstack().T
df

In [None]:
df.plot(figsize=(16, 8))
plt.title("Overall Trend of each Brand's Avg Price over the Years");

In [None]:
#Price Distribution for each car brand

In [None]:
def price(data, feature):
    """Label every value of ``data[feature]`` with its price band.

    Bands are right-inclusive and 10,000 wide: '0 - 10k' covers values up to
    and including 10,000, '10k - 20k' covers values above 10,000 up to 20,000,
    and so on up to '90k - 100k'. Any value that matches no band is labelled
    '100k+'; that includes NaN, which compares False against every bound.

    Returns:
        list[str]: one label per row of ``data[feature]``, in row order.
    """
    # (inclusive upper bound, label), in ascending order of the bound.
    bands = (
        (10000, '0 - 10k'),
        (20000, '10k - 20k'),
        (30000, '20k - 30k'),
        (40000, '30k - 40k'),
        (50000, '40k - 50k'),
        (60000, '50k - 60k'),
        (70000, '60k - 70k'),
        (80000, '70k - 80k'),
        (90000, '80k - 90k'),
        (100000, '90k - 100k'),
    )

    price_ranges = []
    for x in data[feature]:
        for upper, label in bands:
            if x <= upper:
                price_ranges.append(label)
                break
        else:
            price_ranges.append('100k+')

    return price_ranges

In [None]:
#Price Distribution in the dataset
data2['Price Distribution'] = price(data2, 'money')
# Name both columns explicitly: the labels produced by
# value_counts().reset_index() differ between pandas 1.x and 2.x.
df = (
    data2['Price Distribution']
    .value_counts()
    .rename_axis('Price Distribution')
    .reset_index(name='count')
)

fig, ax = plt.subplots(figsize=(8, 8))

ax.pie(
    x=df['count'],
    startangle=90,
    labels=df['Price Distribution'],
    autopct="%.0f%%",
)

plt.title('Price Distribution of Dataset', fontsize=12);

In [None]:
data2['Price Distribution'].value_counts().plot(kind='bar', rot=45)

In [None]:
#Price distribution for different brand
for x in data2.Brand.unique():
    # Copy the brand subset so adding a column does not write into a slice
    # of data2.
    df = data2.loc[data2['Brand']==x].copy()
    df['price_distribution']=price(df, 'money')

    # Name both columns explicitly: the labels produced by
    # value_counts().reset_index() differ between pandas 1.x and 2.x.
    df_plot = (
        df['price_distribution']
        .value_counts()
        .rename_axis('price_distribution')
        .reset_index(name='count')
    )

    fig, ax = plt.subplots()
    ax.pie(
        x=df_plot['count'],
        startangle=90,
        labels=df_plot['price_distribution'],
        autopct="%.0f%%",
        shadow=True,
    )

    plt.title(f'Price Distribution for {x}', fontsize=12);

In [None]:
# One bar chart per brand with the number of cars in each price band.
for x in data2.Brand.unique():
    # Name both columns explicitly: the labels produced by
    # value_counts().reset_index() differ between pandas 1.x and 2.x.
    df = (
        data2.loc[data2['Brand'] == x, 'Price Distribution']
        .value_counts()
        .rename_axis('Price Distribution')
        .reset_index(name='count')
    )
    plt.figure(figsize=(14, 4))
    sns.barplot(data=df, x='Price Distribution', y='count')
    plt.title(f'Price Distribution for {x}', fontsize=12)
    plt.show()

In [None]:
# brand's avg price of each Drivetrain
data2.groupby(['Brand', 'Drivetrain'])['money'].mean().unstack().T

In [None]:
# brand's avg price of each fuel type
data2.groupby(['Brand', 'Fuel type'])['money'].mean().unstack().T

In [None]:
# Assign the filled column back: an in-place fillna on a column selection is
# chained assignment and is not guaranteed to modify data2.
data2['Accidents or damage'] = data2['Accidents or damage'].fillna('No Record found')

In [None]:
data2.groupby('Accidents or damage')['money'].mean()

In [None]:
# brand's avg price of each accidental record
df = data2.groupby(['Brand', 'Accidents or damage'])['money'].mean().unstack()
df

In [None]:
df.plot(kind='bar', figsize=(16,8), rot=45)
plt.title("Brand's Avg Price of each Accidental Record")

In [None]:
# Assign the filled column back: an in-place fillna on a column selection is
# chained assignment and is not guaranteed to modify data2.
data2['Clean title'] = data2['Clean title'].fillna('No Record Found')

In [None]:
df

In [None]:
df = data2.groupby(['Brand', 'Clean title'])['money'].mean().unstack()
df.plot(kind='bar', figsize=(16, 8), rot=45)
plt.title("Brand's Avg Price for Clean Title")

In [None]:
# Avg Price of New/Used/Certified Cars for each brand
for x in data2.Brand.unique():
    df = data2.loc[data2['Brand'] == x].groupby('new&used')['money'].mean().sort_values(ascending=False).reset_index()
    sns.barplot(data=df, x='new&used', y='money')
    plt.title(f"{x}'s Avg Price of New/Used/Certified Cars", fontsize=12)
    plt.show()
    

In [None]:
df=used_cars.groupby('mileage distribution')['money'].mean().sort_values(ascending=False).reset_index()
df

In [None]:
plt.figure(figsize=(16, 6))
sns.barplot(data=df, x='mileage distribution', y='money')
plt.title('Avg Used Car Price among Differnet Mileage Distribution')

In [None]:
#Avg Used Car Price among Differnet Mileage Distribution of differnet brand
used_cars.groupby(['Brand', 'mileage distribution'])['money'].mean().unstack().T

In [None]:
#Column Distribution 

In [None]:
# Assign the filled column back: an in-place fillna on a column selection is
# chained assignment and is not guaranteed to modify data2.
data2['Personal use only'] = data2['Personal use only'].fillna('No Record Found')

In [None]:
# Assign the filled column back: an in-place fillna on a column selection is
# chained assignment and is not guaranteed to modify data2.
data2['1-owner vehicle'] = data2['1-owner vehicle'].fillna('No Record Found')

In [None]:
data2['Fuel type'].value_counts().sort_values(ascending=False)

In [None]:
# Value-count chart per categorical column: a horizontal bar chart when the
# column has more than 10 distinct values, a pie chart otherwise.
for x in ['Fuel type', 'Drivetrain', 'Accidents or damage', 'Clean title', 'Personal use only']:

    # Name both columns explicitly: the labels produced by
    # value_counts().reset_index() differ between pandas 1.x and 2.x.
    df = data2[x].value_counts().rename_axis(x).reset_index(name='count')

    if len(df) > 10:
        plt.figure(figsize=(16, 4))
        plt.title(f'{x} Value Counts Distribution in the Dataset')
        sns.barplot(data=df, y=x, x='count')
    else:
        fig, ax = plt.subplots()
        ax.pie(
            x=df['count'],
            startangle=90,
            labels=df[x],
            autopct="%.0f%%",
            shadow=True,
            )
        plt.title(f'{x} Value Counts Distribution in the Dataset', fontsize=12)

In [None]:
#Model Analysis

In [None]:
#Top 15 Model count in each car brand 

In [None]:
# Bar chart of the 15 most frequent models of every brand.
for x in data2.Brand.unique():
    # value_counts() is already sorted descending. Name both columns
    # explicitly: the labels produced by value_counts().reset_index() differ
    # between pandas 1.x and 2.x.
    df = (
        data2.loc[data2['Brand'] == x, 'Model']
        .value_counts()
        .head(15)
        .rename_axis('Model')
        .reset_index(name='count')
    )
    plt.figure(figsize=(16, 4))
    sns.barplot(data=df, x='Model', y='count')
    plt.xticks(rotation=45)
    plt.title(f'Top 15 Model for {x} in Dataset')
    plt.show()

In [None]:
#Top 15 Average Price for each model of each brand

In [None]:
for x in data2.Brand.unique():
    df = data2.loc[data2['Brand'] == x].groupby('Model')['money'].mean().sort_values(ascending=False)[:15].reset_index()
    plt.figure(figsize=(16, 4))
    sns.barplot(data=df, x='Model', y='money')
    plt.xticks(rotation=45)
    plt.title(f"{x}'s Top 15 Model in terms of Average Price")
    plt.show()

In [None]:
#Top 15 Model with Highest Average Mileage for each brand

In [None]:
# Bar chart of the 15 models with the highest average mileage, per brand.
for x in used_cars.Brand.unique():
    # Build the mask from used_cars itself; a mask taken from data2 has a
    # different index and only works through implicit realignment.
    df = used_cars.loc[used_cars['Brand'] == x].groupby('Model')['Mileage'].mean().sort_values(ascending=False)[:15].reset_index()
    plt.figure(figsize=(16, 4))
    sns.barplot(data=df, x='Model', y='Mileage')
    plt.xticks(rotation=45)
    plt.title(f"{x}'s Top 15 Model in terms of Average Mileage")
    plt.show()

In [None]:
#Apple CarPlay

In [None]:
CarPlay = data2.loc[data2['Entertainment'].str.contains("Apple CarPlay", na=False)]

In [None]:
#Number of Cars that has Apple CarPlay in each Brand
# Name both columns explicitly: the labels produced by
# value_counts().reset_index() differ between pandas 1.x and 2.x.
df = CarPlay['Brand'].value_counts().rename_axis('Brand').reset_index(name='count')
plt.figure(figsize=(16,4))
sns.barplot(data=df, x='Brand', y='count')
plt.xticks(rotation=45)
plt.title('Number of Cars that has Apple CarPlay in each Brand');

In [None]:
#Average Car Price that has Apple CarPlay of Each Brand

In [None]:
df = CarPlay.groupby('Brand')['money'].mean().sort_values(ascending=False).reset_index()
plt.figure(figsize=(16,4))
sns.barplot(data=df, x='Brand', y='money')
plt.xticks(rotation=45)
plt.title('Avg Price for Cars that has Apple CarPlay for each Brand');

In [None]:
#Apple CarPlay Distribution among New/Used/Certified Cars for each Brand

In [None]:
# Iterate the brands present in CarPlay (derived from data2), not those of the
# ratings frame `data`, whose brand set may differ.
for x in CarPlay.Brand.unique():
    # Name both columns explicitly: the labels produced by
    # value_counts().reset_index() differ between pandas 1.x and 2.x.
    df = (
        CarPlay.loc[CarPlay['Brand'] == x, 'new&used']
        .value_counts()
        .rename_axis('new&used')
        .reset_index(name='count')
    )
    if not df.empty:
        sns.barplot(data=df, x='new&used', y='count')
        plt.title(f'Apple CarPlay Distribution among New/Used/Certified Cars for {x}', fontsize=10)
        plt.show()
        

In [None]:
#Find avg ['money', 'mileage'] for each brand of different years

In [None]:
def NY_avg(data):
    """Average every int64/float64 column of ``data`` per (Brand, Year).

    Returns:
        pandas.DataFrame: one row per (Brand, Year) group, sorted by index,
        and one column per averaged feature. An empty frame is returned when
        no column qualifies.
    """
    group_means = [
        data.groupby(['Brand', 'Year'])[y].mean()
        for y in data.columns
        if data[y].dtype == 'int64' or data[y].dtype == 'float64'
    ]
    if not group_means:
        return pd.DataFrame()

    # DataFrame.append was removed in pandas 2.0. Concatenating the Series as
    # columns aligns them on the shared (Brand, Year) index.
    return pd.concat(group_means, axis=1).sort_index()

In [None]:
NY_avg(data2).sort_index()

In [None]:
NY_avg(used_cars)

In [None]:
df = pd.get_dummies(data2.loc[:, 
                              ['Accidents or damage', 
                               'Clean title', 
                               '1-owner vehicle', 
                               'Personal use only', 
                               'Drivetrain']])

dfm = pd.concat([
    data2.drop(['Accidents or damage', 'Clean title', '1-owner vehicle', 'Personal use only','Drivetrain'], axis=1),
    df
], axis=1)

dfm['Year'] = dfm.Year.astype(float)

In [None]:
data2['Personal use only'].unique()

In [None]:
def cruise_control(data, feature):
    """Encode each value of ``data[feature]`` as a cruise-control code.

    Returns a list of strings, one per row:
        '1' - the text contains 'Adaptive Cruise Control'
        '3' - the text contains 'No Record Found', or the value is not a
              string at all (e.g. a missing value)
        '2' - any other text
    """
    blanks = []
    for x in data[feature]:
        if not isinstance(x, str):
            # Missing values are not strings, and `in` on them raises
            # TypeError; treat them as "no record".
            blanks.append('3')
        elif 'Adaptive Cruise Control' in x:
            blanks.append('1')
        elif 'No Record Found' in x:
            blanks.append('3')
        else:
            blanks.append('2')

    return blanks

In [None]:
data2['cruise_control'] = cruise_control(data2, 'Convenience')

In [None]:
cruise = pd.get_dummies(data2['cruise_control'])

cruise = cruise.rename(columns={'1':'cruise_control_Yes', 
                       '2':'cruise_control_No',
                       '3':'cruise_control_No Record Found'})

In [None]:
dfm = pd.concat([
    dfm, cruise
], axis=1)

In [None]:
dfm.head()

In [None]:
# Correlate only numeric and boolean (dummy) columns: corr() no longer
# silently skips string columns.
dfm.select_dtypes(include=['number', 'bool']).corr()['money'].abs().sort_values(ascending=False)

In [None]:
# Correlate only numeric and boolean (dummy) columns: corr() no longer
# silently skips string columns.
sns.heatmap(dfm.select_dtypes(include=['number', 'bool']).corr())

In [None]:
data2.head()

In [None]:
data2.info()

In [None]:
def avg__(data):
    """Average every non-object column of ``data`` per (Brand, Year).

    Returns:
        pandas.DataFrame: one row per (Brand, Year) group, sorted by index,
        and one column per averaged feature. An empty frame is returned when
        no column qualifies.
    """
    group_means = [
        data.groupby(['Brand', 'Year'])[x].mean()
        for x in data.columns
        if data[x].dtype != 'O'
    ]
    if not group_means:
        return pd.DataFrame()

    # DataFrame.append was removed in pandas 2.0. Concatenating the Series as
    # columns aligns them on the shared (Brand, Year) index.
    return pd.concat(group_means, axis=1).sort_index()

In [None]:
avg__(data2)