In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
class Data:
    '''Loads, cleans and explores the training and test datasets.

    Attributes set by ``__init__``:
        target_var, num_features, cat_features: the column-name lists passed in.
        all_cols: concatenation of the three lists above.
        train_df / test_df: cleaned dataframes.
        target_df: the target column(s) of ``train_df``.
        removed_cols: every column dropped while cleaning either file.
    '''
    # columns whose first space-separated token is kept during the "corrupt"
    # step (presumably "<number> <unit>" strings -- confirm against the raw files)
    _UNIT_COLS = ('Mileage', 'Engine', 'Power')
    # a column with more than this share of distinct values is dropped
    _MAX_UNIQUE_RATIO = 0.95
    # a column with more than this share of missing values is dropped
    _MAX_MISSING_RATIO = 0.6
    # rows need at least this many non-missing values to be kept
    _MIN_VALUES_PER_ROW = 5
    # rows further than this many standard deviations from the mean are dropped
    _OUTLIER_STD = 5

    def __init__(self, train_file_path, test_file_path, target_var, num_features, cat_features):
        '''creates training and test dataframes
        cleans datasets before doing feature engineering
        creates summary statistics and visualization of data as an EDA phase

        train_file_path / test_file_path: CSV locations handed to ``_load_data``.
        target_var: list with the name(s) of the target column(s).
        num_features / cat_features: lists of numerical / categorical column names.
        '''
        self.target_var = target_var
        self.num_features = num_features
        self.cat_features = cat_features
        self.all_cols = num_features + cat_features + target_var
        self.removed_cols = []
        # outlier rows are only removed from the training data
        self.train_df = self._clean_data(train_file_path, corrupt=True, missing=True, outlier=True)
        self.test_df = self._clean_data(test_file_path, corrupt=True, missing=True, outlier=False)
        self.target_df = self.train_df[self.target_var]

    def target_summary(self):
        '''prints dtype information and descriptive statistics of the target'''
        print('\nSummary of the target variable:')
        target_frame = self.train_df[self.target_var]
        # info() writes to stdout itself and returns None, so it is not wrapped in print()
        target_frame.info()
        print(target_frame.describe())

    def train_summary(self):
        '''prints shape, dtypes, missing counts, a preview and statistics of the training data'''
        print('\nSummary of the training file:')
        train_cols = self.train_df.drop(columns=self.target_var).shape[1]
        train_rows = self.train_df.shape[0]
        print('\nThe train file has {} features and {} data-points'.format(train_cols, train_rows))
        print('\nData types in each column:')
        print(self.train_df.dtypes)
        print('\nNumber of missing values in each column:')
        print(self.train_df.isna().sum())
        print('\nFirst 3 rows of the training data:')
        print(self.train_df.head(3))
        print('\nNumber of unique values in each column:')
        print(self.train_df.nunique())
        self._print_descriptive_stats(self.train_df)

    def test_summary(self):
        '''prints shape, dtypes, missing counts, a preview and statistics of the test data'''
        print('\nSummary of the test file:')
        test_cols = self.test_df.shape[1]
        test_rows = self.test_df.shape[0]
        print('\nThe test file has {} features and {} data-points'.format(test_cols, test_rows))
        print('\nData types in each column:')
        print(self.test_df.dtypes)
        print('\nNumber of missing values in each column:')
        print(self.test_df.isna().sum())
        print('\nFirst 3 rows of the test data:')
        print(self.test_df.head(3))
        print('\nNumber of unique values in each columns:')
        print(self.test_df.nunique())
        self._print_descriptive_stats(self.test_df)

    @staticmethod
    def _print_descriptive_stats(df):
        '''prints describe() for the non-object and the object columns of df'''
        object_cols = [col for col in df.columns if df[col].dtype == 'object']
        other_cols = [col for col in df.columns if col not in object_cols]
        for heading, cols in (('\nDescriptive stats of numerical features:', other_cols),
                              ('\nDescriptive stats of categorical features:', object_cols)):
            print(heading)
            # describe() raises on a frame without columns, so report that case explicitly
            print(df[cols].describe() if cols else '(no columns of this kind)')

    def visualize_target_var(self):
        '''draws a box plot and a distribution plot of the (first) target variable'''
        #target variable
        print('plotting the distribution of the target variable')
        target_name = self.target_var[0]
        target_values = self.target_df[target_name]
        plt.figure(figsize = (14, 4))
        plt.subplot(1, 2, 1)
        # pass the values by keyword: the first positional argument of the
        # seaborn plotters is not guaranteed to be `x`
        sns.boxplot(x=target_values)
        plt.title(target_name)
        plt.subplot(1, 2, 2)
        # NOTE(review): distplot is flagged as deprecated by recent seaborn
        # releases; consider histplot/displot once the minimum version allows it
        sns.distplot(target_values)
        plt.title(target_name)
        plt.show()

    def visualize_cat_features(self):
        '''for every categorical feature with fewer than 15 levels, draws a
        count plot and a box plot of the target per level
        '''
        #categorical variables of the training set
        print('plotting the distribution of categorical variables of the training data')
        # only columns that survived cleaning of the *training* data can be plotted
        plot_cols = [col for col in self.cat_features
                     if col in self.train_df.columns and self.train_df[col].nunique() < 15]
        if not plot_cols:
            return
        n_rows = len(plot_cols)
        # one row per feature; 4 rows give the historical 14 x 18 figure
        plt.figure(figsize=(14, 4.5 * n_rows))
        for row, col in enumerate(plot_cols):
            plt.subplot(n_rows, 2, 2 * row + 1)
            sns.countplot(x=self.train_df[col])
            plt.title(col)
            plt.xticks(rotation=90)

            plt.subplot(n_rows, 2, 2 * row + 2)
            sns.boxplot(x=col, y=self.target_var[0], data=self.train_df)
            plt.title(col)
            plt.xticks(rotation=90)
        plt.tight_layout()
        plt.show()

    def visualize_num_features(self):
        '''draws a distribution plot for every numerical feature of the training data'''
        print('plotting the distribution of numerical variables of the training data')
        #numerical variables of the training set
        plot_cols = [col for col in self.num_features if col in self.train_df.columns]
        # three plots per row; never fewer than the historical 3 rows so that
        # small feature sets keep the original 16 x 7 layout
        n_rows = max(3, -(-len(plot_cols) // 3))
        plt.figure(figsize=(16, 7 * n_rows / 3))
        for position, col in enumerate(plot_cols, start=1):
            plt.subplot(n_rows, 3, position)
            sns.distplot(self.train_df[col])
            plt.title(col)
        plt.tight_layout()
        plt.show()

    def plot_heatmap(self):
        '''draws a heatmap of the pairwise correlations of the training data'''
        # heatmap
        print('ploting heatmap to show correlations among variables')
        plt.figure(figsize = (7, 6))
        # restrict to numeric/boolean columns: corr() cannot handle text columns
        numeric_part = self.train_df.select_dtypes(include=['number', 'bool'])
        sns.heatmap(numeric_part.corr(), cmap = 'Blues', annot = True, vmin = 0.2)
        plt.show()

    def _clean_data(self, file_path, corrupt=True, missing=True, outlier=True):
        '''loads a file and returns the cleaned dataframe

        corrupt: keep the first token of the ``_UNIT_COLS`` columns and turn
            'null' / 0 into NaN.
        missing: drop sparse rows and columns, convert numerical features to
            numbers, fill NaN with the mean (numerical) or mode (categorical).
        outlier: drop rows whose numerical features lie outside mean +/- 5 std.

        Columns dropped here are appended to ``self.removed_cols``.
        Raises whatever ``pd.to_numeric`` raises for unparsable numerical values.
        '''
        df = self._load_data(file_path)
        removed = []

        # drop (almost) all-distinct columns; the target is never dropped here
        n_rows = len(df)
        for col in df.columns:
            if col in self.target_var or n_rows == 0:
                continue
            if df[col].nunique() / n_rows > self._MAX_UNIQUE_RATIO:
                removed.append(col)
        df = df.drop(columns=removed)

        # fix corrupt data
        if corrupt:
            for col in self._UNIT_COLS:
                # columns that are absent or already numeric have nothing to split
                if col in df.columns and not pd.api.types.is_numeric_dtype(df[col]):
                    df[col] = df[col].str.split(' ', expand=True)[0]
            df = df.replace(['null', 0], np.nan)

        # fix missing values
        if missing:
            df = df.dropna(thresh=self._MIN_VALUES_PER_ROW)
            n_rows = len(df)
            sparse_cols = [col for col in df.columns
                           if n_rows and df[col].isna().sum() / n_rows > self._MAX_MISSING_RATIO]
            df = df.drop(columns=sparse_cols)
            removed.extend(sparse_cols)
            for col in self.num_features:
                if col in df.columns:
                    df[col] = pd.to_numeric(df[col], errors='raise')
                    df[col] = df[col].fillna(df[col].mean())
            for col in self.cat_features:
                if col in df.columns:
                    # mode() returns a Series; fillna needs the scalar value,
                    # otherwise it fills by index alignment only
                    modes = df[col].mode()
                    if not modes.empty:
                        df[col] = df[col].fillna(modes.iloc[0])

        # fix outlier data
        if outlier:
            for col in self.num_features:
                if col not in df.columns:
                    continue
                df[col] = pd.to_numeric(df[col])
                center, spread = df[col].mean(), df[col].std()
                # a constant (or single-row) column has no outliers; filtering
                # with strict bounds would otherwise discard every row
                if not spread > 0:
                    continue
                min_range = center - self._OUTLIER_STD * spread
                max_range = center + self._OUTLIER_STD * spread
                df = df[(df[col] > min_range) & (df[col] < max_range)].copy()

        # remember every dropped column (across all cleaned files, no duplicates)
        known = getattr(self, 'removed_cols', [])
        self.removed_cols = known + [col for col in removed if col not in known]
        # return the cleaned dataframe
        return df

    def _load_data(self, file_path):
        '''reads a CSV file into a dataframe'''
        return pd.read_csv(file_path)

In [None]:
class FeatureEngineering:
    '''performs feature engineering and feature selection for modeling

    For every categorical feature still present in the training data:
      * infrequent levels are merged into a single 'Rare' level,
      * features with more than 10 levels are target-encoded,
      * the remaining features are one-hot encoded.
    All encodings are derived from the training data and then applied to the
    test data, so both frames end up with the same encoded columns.
    The results are written back to ``data.train_df``, ``data.test_df`` and
    ``data.target_df``.
    '''
    def __init__(self, data):
        '''data: object exposing train_df, test_df, cat_features and target_var'''
        self.data = data
        # work on copies so column assignment never touches a filtered view
        train_df = data.train_df.copy()
        test_df = data.test_df.copy()
        dummy_cols = []
        for col in data.cat_features:
            if col not in train_df.columns:
                # the column was dropped while cleaning; nothing to encode
                continue
            in_test = col in test_df.columns

            # merge infrequent levels; the list comes from the training data only
            rare_levels = self._rare_levels(train_df[col])
            if rare_levels:
                train_df[col] = train_df[col].where(~train_df[col].isin(rare_levels), 'Rare')
                if in_test:
                    test_df[col] = test_df[col].where(~test_df[col].isin(rare_levels), 'Rare')

            if train_df[col].nunique() > 10:
                # imported lazily: category_encoders is only needed for
                # high-cardinality features
                from category_encoders.target_encoder import TargetEncoder
                encoder = TargetEncoder(cols=[col])
                encoder.fit(train_df[[col]], train_df[data.target_var])
                train_df[col] = encoder.transform(train_df[[col]])[col]
                if in_test:
                    test_df[col] = encoder.transform(test_df[[col]])[col]
            else:
                dummy_cols.append(col)
                # fix the level set to the one seen in training so that
                # get_dummies emits identical columns for both frames
                levels = pd.Categorical(train_df[col]).categories
                train_df[col] = pd.Categorical(train_df[col], categories=levels)
                if in_test:
                    if 'Rare' in levels:
                        unseen = test_df[col].notna() & ~test_df[col].isin(levels)
                        test_df.loc[unseen, col] = 'Rare'
                    # levels still unknown become NaN, i.e. an all-zero dummy row
                    test_df[col] = pd.Categorical(test_df[col], categories=levels)

        test_dummy_cols = [col for col in dummy_cols if col in test_df.columns]
        data.train_df = pd.get_dummies(train_df, columns=dummy_cols, drop_first = True)
        data.test_df = pd.get_dummies(test_df, columns=test_dummy_cols, drop_first = True)
        data.target_df = data.train_df[data.target_var]

    @staticmethod
    def _rare_levels(values, max_level_ratio=0.1, min_share=0.01):
        '''returns the levels of `values` that cover less than `min_share` of
        the rows; empty when distinct levels / rows is not below `max_level_ratio`
        '''
        n_rows = len(values)
        if n_rows == 0 or values.nunique() / n_rows >= max_level_ratio:
            return []
        shares = values.value_counts() / n_rows
        return list(shares[shares < min_share].index)
        
#         if scale:
#             from sklearn.preprocessing import StandardScaler
#             scaler = StandardScaler()
#             data.train_df = pd.DataFrame(scaler.fit_transform(data.train_df), columns=data.train_df.columns)
#             data.test_df = pd.DataFrame(scaler.fit_transform(data.test_df), columns=data.test_df.columns)
#             self.scaler = scaler
#             data.target_df = data.train_df[data.target_var]

In [None]:
class Modeling:
    '''Registers candidate models, compares them by cross-validation, tunes
    the best one and writes predictions for the test data.

    Attributes set by ``modeling_summary``: data, k_cross_val, metric, scores,
    best_score, best_model, best_model_name.
    Attributes set by ``model_tuning``: hyper_parameters, grid_search,
    best_params, tuned_model.
    '''
    def __init__(self, models=None):
        '''models: optional dict mapping a model name to an estimator'''
        # a fresh dict per instance: a shared default dict would leak models
        # between Modeling objects
        self.models = {} if models is None else models

    def add_model(self, model_name, model):
        '''registers (or replaces) an estimator under `model_name`'''
        self.models[model_name] = model

    def modeling_summary(self, data, k_cross_val=3, processor=-1, metric='neg_mean_squared_error'):
        '''cross-validates every registered model and records the best one

        data: object exposing train_df and target_var.
        k_cross_val: value passed as `cv` to cross_val_score.
        processor: value passed as `n_jobs` to cross_val_score.
        metric: value passed as `scoring`; a larger score counts as better.
        Raises ValueError when no model is registered and RuntimeError when no
        model produced a comparable score.
        '''
        from sklearn.model_selection import cross_val_score

        if not self.models:
            raise ValueError('no models registered; call add_model() first')
        self.data = data
        self.k_cross_val = k_cross_val
        self.metric = metric
        self.scores = {}
        features, target = self._split_features_target(data)

        best_mean = float('-inf')
        best_name = None
        for model_name, model in self.models.items():
            score_list = cross_val_score(estimator=model, X=features, y=target,
                                    cv=k_cross_val, n_jobs=processor, scoring=self.metric)
            mean_score = score_list.mean()
            self.scores[model_name] = round(mean_score, 3)
            if mean_score > best_mean:
                # keep the running maximum, otherwise the last model always wins
                best_mean = mean_score
                best_name = model_name
                self.best_score = round(mean_score, 3)
                self.best_model = model
                self.best_model_name = model_name
        if best_name is None:
            raise RuntimeError('no model produced a comparable cross-validation score')
        print("Here is the list of applied models and their '{}' scores:\n {}".format(self.metric, self.scores))
        print("\nThe best model was '{}' with '{}' score of {}".format(self.best_model_name, self.metric, self.best_score))

    def get_feature_importance(self, data=None, random_state=None):
        '''returns sorted features based on their importances,
        which were calculated using RandomForest method

        data: optional data object; defaults to the one given to modeling_summary.
        random_state: optional seed forwarded to RandomForestRegressor.
        '''
        from sklearn.ensemble import RandomForestRegressor
        features, target = self._split_features_target(self._resolve_data(data))
        model = RandomForestRegressor(random_state=random_state)
        model.fit(features, target)
        feature_importances = pd.DataFrame({'Feature': features.columns,
                                            'Importance': model.feature_importances_})
        feature_importances = feature_importances.sort_values(by='Importance', ascending=False)
        return feature_importances.set_index('Feature', drop=True)

    def model_tuning(self, hyper_parameters, data=None):
        '''grid-searches `hyper_parameters` for the best model and prints the outcome

        The fitted search, its best parameters and its best estimator are kept
        in grid_search, best_params and tuned_model; best_model is left untouched.
        Raises RuntimeError when modeling_summary has not been run yet.
        '''
        from sklearn.model_selection import GridSearchCV
        if not hasattr(self, 'best_model'):
            raise RuntimeError('run modeling_summary() before model_tuning()')
        self.hyper_parameters = hyper_parameters
        features, target = self._split_features_target(self._resolve_data(data))
        grid_search = GridSearchCV(estimator=self.best_model,
                                         param_grid=self.hyper_parameters,
                                         scoring=self.metric,
                                         n_jobs=-1,
                                         cv=self.k_cross_val,
                                         return_train_score=True)
        grid_search.fit(X=features, y=target)
        self.grid_search = grid_search
        self.best_params = grid_search.best_params_
        self.tuned_model = grid_search.best_estimator_
        print('Best combination of hyperparameters are: {}'.format(grid_search.best_params_))
        print("Best achieved '{}' score is: {}".format(self.metric, round(grid_search.best_score_, 3)))

    def save_results(self, file_path, data=None):
        '''fits best_model on the training data and writes the test predictions
        to `file_path`/Predicted_Price.csv

        Raises RuntimeError when modeling_summary has not been run yet.
        '''
        if not hasattr(self, 'best_model'):
            raise RuntimeError('run modeling_summary() before save_results()')
        data = self._resolve_data(data)
        features, target = self._split_features_target(data)
        self.best_model.fit(features, target)
        predictions = self.best_model.predict(data.test_df)
        # the CSV needs one value per row, so flatten a single-column 2-D result
        if getattr(predictions, 'ndim', 1) == 2 and predictions.shape[1] == 1:
            predictions = predictions.ravel()
        predict_file = pd.DataFrame({'Car_ID': data.test_df.index, 'Car_Price': predictions})
        predict_file.to_csv(file_path + '/Predicted_Price.csv', index=False)

    def _resolve_data(self, data=None):
        '''picks the data object: explicit argument, then the one stored by
        modeling_summary, then a module-level `data` (legacy notebook usage)
        '''
        if data is not None:
            return data
        stored = getattr(self, 'data', None)
        if stored is not None:
            return stored
        legacy = globals().get('data')
        if legacy is not None:
            return legacy
        raise ValueError('no data available; pass `data` or run modeling_summary() first')

    @staticmethod
    def _split_features_target(data):
        '''returns (features, target) of the training data; a single target
        column is returned as a 1-D Series
        '''
        features = data.train_df.drop(columns=data.target_var)
        target = data.train_df[data.target_var]
        if getattr(target, 'ndim', 1) == 2 and target.shape[1] == 1:
            target = target.iloc[:, 0]
        return features, target

In [None]:
# Column roles used by the Data / FeatureEngineering / Modeling classes.
cat_features = [
    'Name',
    'Location',
    'Fuel_Type',
    'Transmission',
    'Owner_Type',
]
num_features = [
    'Year',
    'Kilometers_Driven',
    'Seats',
    'Mileage',
    'Engine',
    'Power',
    'New_Price',
]
target_var = ['Price']

# Locations of the raw CSV files.
# NOTE(review): absolute, machine-specific paths -- adjust before running elsewhere.
test_file_path = 'C:/Users/mehdi/Downloads/used-cars-price-prediction/test-data.csv'
train_file_path = 'C:/Users/mehdi/Downloads/used-cars-price-prediction/train-data.csv'

In [None]:
# Load and clean both files; the cleaned frames live on `data.train_df` / `data.test_df`.
data = Data(
    train_file_path=train_file_path,
    test_file_path=test_file_path,
    target_var=target_var,
    num_features=num_features,
    cat_features=cat_features,
)

In [None]:
# Print dtype information and descriptive statistics of the target column.
# The two calls below print the equivalent summaries for the full train/test frames.
data.target_summary()
# data.train_summary()
# data.test_summary()

In [None]:
# Plot the target distribution; the remaining plots are optional.
data.visualize_target_var()
# data.visualize_num_features()
# data.visualize_cat_features()
# data.plot_heatmap()

In [None]:
# Encode the categorical features; this rewrites data.train_df, data.test_df and data.target_df.
myfeatures = FeatureEngineering(data)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

# Register the candidate regression models, then compare them by cross-validation.
mymodel = Modeling()
mymodel.add_model(model_name='lin_reg', model=LinearRegression())
mymodel.add_model(
    model_name='rand_forest',
    model=RandomForestRegressor(
        n_estimators=60,
        max_depth=15,
        min_samples_split=80,
        max_features=8,
        n_jobs=-1,
    ),
)

mymodel.modeling_summary(data=data)

In [None]:
# Parameter grid searched for the best model selected by modeling_summary().
hyper_parameters = [
    dict(
        n_estimators=[40, 60, 100],
        max_depth=[5, 15, 40],
        min_samples_split=[40, 80, 100],
        max_features=[5, 8, 11],
    )
]
mymodel.model_tuning(hyper_parameters=hyper_parameters)

In [None]:
# Rank the training features by the importances of a freshly fitted RandomForestRegressor.
mymodel.get_feature_importance()

In [None]:
# Refit the best model on the training data and write Predicted_Price.csv into this folder.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
mymodel.save_results('C:/Users/mehdi/Downloads/used-cars-price-prediction')