# Introduction

In this notebook we use machine learning regression algorithms to predict laptop prices.

In [None]:
import pandas as pd
import numpy as np

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [None]:
# Location of the laptop-price CSV; adjust this path when running outside Kaggle.
DATA_PATH = "/kaggle/input/laptop-prices-dataset/laptopPrice.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows to check that the file parsed as expected.
df.head(n=5)

In [None]:
# Number of distinct values per column.
df.nunique(axis=0)

In [None]:
# Count of missing values per column.
df.isnull().sum()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

# Outliers detection and removal

In [None]:
# Distribution of the target before capping; points beyond the whiskers are
# the candidate outliers.
# `x=` is passed explicitly: a bare positional Series is treated as `x` by older
# seaborn releases (with a deprecation warning) and as `data` by newer ones,
# which changes the orientation of the plot.
ax = sns.boxplot(x=df['Price'])
ax.set_title('Price before outlier capping');

## Function to remove outliers using quantiles

Setting up IQR
$$Q3-Q1=IQR$$

And thus we can set up our lower and upper boundaries:
* Lower boundary
$$Q1 - 1.5 \times IQR$$
* Upper boundary
$$Q3 + 1.5 \times IQR$$

Anything below the lower boundary or above the upper boundary is considered an **OUTLIER**

In [None]:
def remove_outliers(df, column='Price', whisker=1.5):
    """Cap outliers of ``column`` at the IQR fences and return a new frame.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR`` where
    ``IQR = Q3 - Q1``. Values outside the fences are replaced by the nearest
    fence (rows are kept, not dropped).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``. It is not modified.
    column : str, default 'Price'
        Name of the numeric column to cap.
    whisker : float, default 1.5
        Multiplier applied to the IQR when computing the fences.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` whose ``column`` is clipped to the two fences.
    """
    q1, q3 = np.percentile(df[column], [25, 75])
    iqr = q3 - q1
    min_bound = q1 - whisker * iqr
    max_bound = q3 + whisker * iqr

    # Work on a copy so that re-running the calling cell always starts from the
    # same input, and clip on BOTH sides (the lower fence used to be computed
    # but never applied).
    capped = df.copy()
    capped[column] = capped[column].clip(lower=min_bound, upper=max_bound)
    return capped

In [None]:
# Replace the working frame with its outlier-capped version.
df = df.pipe(remove_outliers)

In [None]:
# Same plot after capping, to confirm the extreme prices were pulled in.
# `x=` is passed explicitly: a bare positional Series is treated as `x` by older
# seaborn releases (with a deprecation warning) and as `data` by newer ones,
# which changes the orientation of the plot.
ax = sns.boxplot(x=df['Price'])
ax.set_title('Price after outlier capping');

# Preprocessing the data

### Convert GB values to numerical values and rating preprocessing

In [None]:
def rating_preprocess(x):
    """Return the first whitespace-separated token of ``x`` as an ``int``."""
    tokens = x.split(maxsplit=1)
    return int(tokens[0])

def memory_process(x):
    """Parse the leading integer amount from a size string (amount, then unit)."""
    first_token = x.split()[0]
    return int(first_token)

# Columns that store a size as text; each one is reduced to its integer amount.
not_categorical = ['ram_gb', 'ssd', 'hdd', 'graphic_card_gb']
for size_col in not_categorical:
    df[size_col] = df[size_col].apply(memory_process)

# The rating label is reduced to its leading integer in the same way.
df['rating'] = df['rating'].apply(rating_preprocess)

### Encoding

In [None]:
# Label-encode every remaining categorical feature column.
# A fresh LabelEncoder is created per column: reusing a single instance would
# leave `encoders` holding several references to the same object, fitted only on
# the last column, so earlier columns could not be inverse-transformed.
encoders = []  # one fitted encoder per encoded column, in column order
# All but the last four columns (presumably the target, rating and the two
# count columns; confirm against df.columns).
feats = df.columns[:-4]
for col in feats:
    if col in not_categorical:
        continue  # already converted to integers
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders.append(le)

In [None]:
# Check the encoded frame: every feature column should now be numeric.
df.head(n=3)

In [None]:
# Pairwise scatter plots of the target, the two count columns and the size columns.
pairplot_vars = ['Price', 'Number of Ratings', 'Number of Reviews', *not_categorical]
sns.pairplot(df, vars=pairplot_vars)

In [None]:
target = 'Price'
# Every column other than the target is used as a model input.
features = [col for col in df.columns if col != target]

In [None]:
# Min-max scale every column (target included) and rebuild the frame with the
# original column names.
# NOTE(review): the scaler is fit on the full frame before the train/test split
# that follows, so test-set minima/maxima influence the training features, and
# the reported MAE/MSE are in scaled-price units. Consider fitting on the
# training split only.
all_cols = df.columns
mms = MinMaxScaler()
scaled_values = mms.fit_transform(df)
df = pd.DataFrame(scaled_values, columns=all_cols)

# Plain arrays for scikit-learn: feature matrix and target vector.
x = df[features].to_numpy()
y = df[target].to_numpy()

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [None]:
class Models:
    """Fit regressors on one data split and report their metrics on another.

    Parameters
    ----------
    train : tuple
        ``(x, y)`` pair used by :meth:`training` to fit a model.
    test : tuple
        ``(x, y)`` pair used by :meth:`evaluate` to score a model.
    """

    def __init__(self, train, test):
        self.train = train
        self.test = test

    def training(self, model, name):
        """Fit ``model`` on the training pair, print its test metrics, return it.

        Parameters
        ----------
        model : estimator
            Object exposing ``fit(x, y)`` and ``predict(x)``.
        name : str
            Label printed above the metrics.

        Returns
        -------
        estimator
            The same ``model`` object, now fitted.
        """
        x, y = self.train

        model.fit(x, y)
        self.evaluate(model, name, False)
        return model

    def evaluate(self, model, name, plot=True):
        """Score ``model`` on the test pair, print the metrics and return them.

        Parameters
        ----------
        model : estimator
            Fitted object exposing ``predict(x)``.
        name : str
            Label printed above the metrics.
        plot : bool, default True
            Currently unused; kept so existing calls keep working.

        Returns
        -------
        dict
            ``{'MAE': ..., 'MSE': ..., 'r2': ...}`` computed on the test pair,
            so results of several models can be collected and compared.
        """
        x, y = self.test
        y_pred = model.predict(x)
        metrics = {
            'MAE': mean_absolute_error(y, y_pred),
            'MSE': mean_squared_error(y, y_pred),
            'r2': r2_score(y, y_pred),
        }
        print("{}\n{}\nMAE: {}\nMSE: {}\nr2: {}".format(
            name, "-"*20, metrics['MAE'], metrics['MSE'], metrics['r2']))
        return metrics

# Kendall’s tau coefficient

Kendall’s tau coefficient is used for Ordinal data types. Kendall’s Tau coefficient and Spearman’s rank correlation coefficient assess statistical associations based on the ranks of the data.

Kendall’s correlation coefficient uses pairs of observations and determines the strength of association based on the pattern of concordance and discordance between the pairs

* Concordant pairs. Example: $$(x_1 < x_2 \text{ and } y_1 < y_2) \quad \text{or} \quad (x_1 > x_2 \text{ and } y_1 > y_2)$$ The two observations are in the same order with respect to each variable
* Discordant pairs. Example: $$(x_1 < x_2 \text{ and } y_1 > y_2) \quad \text{or} \quad (x_1 > x_2 \text{ and } y_1 < y_2)$$ The two observations are ordered in opposite directions

In [None]:
# Pairwise Kendall's tau between all columns of the frame.
corr = df.corr(method='kendall')

In [None]:
# Annotated heatmap of the Kendall correlation matrix.
sns.heatmap(data=corr, annot=True)

In [None]:
# Correlation of every column with the target.
corr.loc[:, 'Price']

# Training models, Evaluation

The list above shows Kendall's tau between each column and Price. Looking at it, only processor_name, ram_gb and ssd appear to be at least moderately correlated with the target.

Hence, we do not select a special subset of features for the resulting models; all features are kept as they are.

During training, each model is evaluated on the test set using MAE, MSE and the $R^2$ score.

In [None]:
ml_all = Models((x_train, y_train), (x_test, y_test))

# Shared seed for the stochastic learners so that the reported scores are the
# same on every Restart & Run All (the train/test split is already seeded).
RANDOM_STATE = 42

lnr = LinearRegression()
rfr = RandomForestRegressor(n_estimators=100, max_depth=50, criterion='friedman_mse',
                            random_state=RANDOM_STATE)
dtr = DecisionTreeRegressor(criterion='friedman_mse', max_depth=50,
                            random_state=RANDOM_STATE)
abr = AdaBoostRegressor(n_estimators=70, learning_rate=0.21, loss='exponential',
                        random_state=RANDOM_STATE)
xgb = XGBRegressor(n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7,
                   colsample_bytree=0.8, random_state=RANDOM_STATE)

# `models` and `names` are parallel lists: names[k] labels models[k].
models = [lnr, rfr, dtr, abr, xgb]
names = ['Linear Regression', 'Random Forest Regressor', 'Decision Tree Regressor',
        'Ada Boost Regressor', 'XGBRegressor']

In [None]:
# Fit each model in turn (metrics are printed by `training`) and keep the
# fitted estimators in the same order as `models`.
trained = []
for model, label in zip(models, names):
    fitted = ml_all.training(model, label)
    trained.append(fitted)
    print()