In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder,OrdinalEncoder,OneHotEncoder
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.feature_selection import SelectKBest,f_regression,mutual_info_regression
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score,cross_val_predict,RandomizedSearchCV
from sklearn.linear_model import LinearRegression,LogisticRegression,Ridge,Lasso
from sklearn.metrics import r2_score,accuracy_score
from sklearn.tree import DecisionTreeRegressor
import xgboost
from sklearn.ensemble import AdaBoostRegressor,BaggingRegressor,GradientBoostingRegressor,RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from numpy import nan
import pickle

In [2]:
dataset=pd.read_csv('mercedesbenz.csv')
dataset

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,8405,107.39,ak,s,as,c,d,aa,d,q,...,1,0,0,0,0,0,0,0,0,0
4205,8406,108.77,j,o,t,d,d,aa,h,h,...,0,1,0,0,0,0,0,0,0,0
4206,8412,109.22,ak,v,r,a,d,aa,g,e,...,0,0,1,0,0,0,0,0,0,0
4207,8415,87.48,al,r,e,f,d,aa,l,u,...,0,0,0,0,0,0,0,0,0,0


In [3]:
def load_data(dataset):
    if 'y' in dataset.columns:
        X=dataset.drop(columns=['y','ID'])
        y=dataset['y']
        IQR=y.quantile(0.75)-y.quantile(0.25)
        upper_threshold=y.median()+3*IQR
        lower_threshold=y.median()-3*IQR
        y=pd.DataFrame(np.where(y>upper_threshold,upper_threshold,y),columns=['y'])
        return X,y
    elif 'ID' in dataset.columns:
        X=dataset.drop(columns=['ID'])
        return X
    else:
        X=dataset.copy()
        return X

In [4]:
minmax=MinMaxScaler(feature_range=(0,1))
scaler=StandardScaler()
le=LabelEncoder()
oe=OrdinalEncoder()
ohe=OneHotEncoder(sparse=False,drop='first')
num_removed_cols=[]
cat_encoded_cols=[]
def preprocessing(datset,X):
    global cat_cols
    global num_cols
    cat_cols=X.select_dtypes(include='object').columns.values.tolist()
    num_cols=X.select_dtypes(exclude='object').columns.values.tolist()
    for col in num_cols:
        if X[col].nunique()==1:
            num_removed_cols.append(col)
    for col in cat_cols:
        Mean_encoded_col=dataset.groupby(col)['y'].median().sort_values(ascending=True).index
        encoded_col={k:i for i,k in enumerate(Mean_encoded_col,0)}
        encoded_dict={col:encoded_col}
        cat_encoded_cols.append(encoded_dict)

In [5]:
def transformation(X):
    for col in num_removed_cols:
        if col in X.columns:
            X=X.drop(labels=col,axis=1)
        else:
            continue
    for cols in cat_encoded_cols:
        for key,values in cols.items():
            if key in X.columns:
                for col in cat_cols:
                    if key==col:
                        X[col]=X[col].map(values)
                        X[[col]]=scaler.fit_transform(X[[col]])
            else:
                continue
    return X

In [6]:
def feature_selection(X,y):
    Selector=SelectKBest(score_func=f_regression,k=20)
    Selector.fit(X,y)
    X_new=Selector.fit_transform(X,y)
    X_new=pd.DataFrame(Selector.fit_transform(X,y),columns=X.columns[Selector.get_support()])
    global selected_columns
    selected_columns=X.columns[Selector.get_support()]
    return X_new

In [7]:
models={'Linear Regression':LinearRegression(),'Ridge Regression':Ridge(),'Lasso Regression':Lasso(),'Decision Tree':DecisionTreeRegressor(),
       'XGBoost':xgboost.XGBRegressor(),'AdaBoost':AdaBoostRegressor(),'Bagging Regressor':BaggingRegressor(),'Gradient Boosting':GradientBoostingRegressor(),
       'Random Forest': RandomForestRegressor()}
def train_model(X,y):
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)
    for model_names,model in models.items():
        if model_names=='XGBoost' :
            params={
                     "learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
                     "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
                     "min_child_weight" : [ 1, 3, 5, 7 ],
                     "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
                     "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ], 
                    }
            xgboost_test=RandomizedSearchCV(model,param_distributions=params,n_iter=15,\
                                 scoring='r2',n_jobs=-1,cv=5,verbose=8,random_state=42)
            # estimator=xgboost_test.best_estimator_
            # regressor=
            xgboost_model=xgboost_test.fit(X_train,y_train)
            xgboost_train=xgboost_model.predict(X_train)
            xgboost_pred=xgboost_model.predict(X_test)
            print('The accuracy for train is',r2_score(y_train,xgboost_train))
            print('The accuracy for test is',r2_score(y_test,xgboost_pred))
            pickle.dump(xgboost_test,open('XGBoost_regression.pkl','wb'))
            predictions=pd.DataFrame()
            predictions['y_train_true']=y_train
            predictions['y_train_pred']=xgboost_train.tolist()
            test_predictions=pd.DataFrame()
            test_predictions['y_test_true']=y_test
            test_predictions['y_test_pred']=xgboost_pred.tolist()
            return predictions,test_predictions
        else:
            continue

In [None]:
# data=pd.read_csv('test.csv')
# type(data)

In [8]:
def modelling():
    global y
    X,y=load_data(dataset)
    print(X)
    preprocessing(dataset,X)
    print('--------------------------')
    print(X)
    X=transformation(X)
    print('--------------------------')
    print(X)
    X=feature_selection(X,y)
    print('--------------------------')
    print(X)
    predictions,test_predictions=train_model(X,y)
    print('--------------------------')
    return X
# X=modelling()
# X

In [9]:
def test_model(data):
    modelling()
    global X
    X=load_data(data)
    X=transformation(X)
    X=X[selected_columns]
    xgboost_model=pickle.load(open('XGBoost_regression.pkl','rb'))
    predictions=pd.DataFrame(index=data['ID'])
    predictions['y']=xgboost_model.predict(X)
    return predictions

In [None]:
# selected_columns

In [10]:
X0='k'
X2='at'
X29=0
X54=0
X76=0
X127=0
X136=1
X162=0
X166=0
X178=0
X232=0
X250=0
X261=0
X263=1
X272=0
X276=0
X279=0
X313=0
X314=0
X328=0

In [11]:
data1=pd.DataFrame([[X0, X2, X29, X54, X76, X127, X136, X162, X166, X178,X232, X250, X261, X263, X272, 
        X276, X279, X313, X314,X328]],columns=['X0', 'X2', 'X29', 'X54', 'X76', 'X127', 'X136',' X162', 'X166', 'X178','X232', 'X250', 'X261', 'X263', 'X272','X276', 'X279', 'X313', 'X314','X328'])
data1

Unnamed: 0,X0,X2,X29,X54,X76,X127,X136,X162,X166,X178,X232,X250,X261,X263,X272,X276,X279,X313,X314,X328
0,k,at,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0


In [12]:
def app_transformation(X):
    for cols in cat_encoded_cols:
        for key,values in cols.items():
            if key in X.columns:
                X[key]=X[key].map(values)
                X[[key]]=scaler.transform(X[[key]])
            else:
                continue
    return X

In [13]:
def app_model(data):
    modelling()
    X=load_data(data)
    X=app_transformation(X)
    xgboost_model=pickle.load(open('XGBoost_regression.pkl','rb'))
    predictions=xgboost_model.predict(X)
    return predictions

In [15]:
predictions=app_model(data1)
predictions[0]
# output=predictions.iloc[0].values[0]

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
output

In [None]:
Test=test_model(data)
Test

In [None]:
Test

In [None]:
Test.iloc[0].values[0]

In [None]:
test_csv=Test.to_csv('Predictions.csv')

In [None]:
selected_columns

In [None]:
data['X0'].unique()

In [None]:
data['X2'].unique()