# About Dataset

## CONTEXT  
The used and refurbished device market has grown considerably over the past decade, as it provides cost-effective alternatives to both consumers and businesses that are looking to save money when purchasing a device. Maximizing the longevity of devices through second-hand trade also reduces their environmental impact and helps in recycling and reducing waste. Here is a sample dataset of normalized used and new pricing data of refurbished / used devices.

## OBJECTIVE  
The objective is to perform Exploratory Data Analysis (EDA) and apply Linear Regression to build a model that can help in pricing such devices.  

## Variables  
__device_brand__: Name of manufacturing brand  
__os__: OS on which the device runs  
__screen_size__: Size of the screen in cm  
__4g__: Whether 4G is available or not  
__5g__: Whether 5G is available or not  
__rear_camera_mp__: Resolution of the rear camera in megapixels  
__front_camera_mp__: Resolution of the front camera in megapixels  
__internal_memory__: Amount of internal memory (ROM) in GB  
__ram__: Amount of RAM in GB  
__battery__: Energy capacity of the device battery in mAh  
__weight__: Weight of the device in grams  
__release_year__: Year when the device model was released  
__days_used__: Number of days the used/refurbished device has been used  
__normalized_new_price__: Normalized price of a new device of the same model  
__normalized_used_price (TARGET)__: Normalized price of the used/refurbished device  

# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import norm, boxcox
from scipy import stats
from collections import Counter
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import statsmodels.api as sm
import pickle as pk

# Getting data

In [None]:
data = pd.read_csv('used_device_data.csv')

# Initial Review

In [None]:
data.head()

In [None]:
data.info()

In [None]:
data.describe()

# Checking Missing Values

In [None]:
data.isnull().sum()

In [None]:
sns.heatmap(data.isnull(),yticklabels=False,cbar=False,cmap='viridis')

## Missing Values found!

In [None]:
data = data.dropna()

## Missing values dropped!

In [None]:
sns.heatmap(data.isnull(),yticklabels=False,cbar=False,cmap='viridis')

# Checking duplicate values

In [None]:
data.duplicated().sum()

## No duplicate values found!

## Mapping categorical features(4G,5G)

In [None]:
dict_G = {'yes':1,'no':0}

In [None]:
data['4g'] = data['4g'].map(dict_G)

In [None]:
data['5g'] = data['5g'].map(dict_G)

In [None]:
# Target-encode the brand: map every brand name to the median
# normalized_used_price of its rows and store it in a new numeric column.
# NOTE(review): the medians are computed on the full dataset, before the
# train/test split performed later in this notebook, and 'Device_Brand' is kept
# as a model feature, so test-set target values leak into it. Consider
# fitting this mapping on the training rows only.
brand_dict = data.groupby(['device_brand'])['normalized_used_price'].median().to_dict()
data['Device_Brand'] = data['device_brand'].map(brand_dict)

In [None]:
data.head()

# Exploring Categorical features

In [None]:
len(data.device_brand.value_counts())

In [None]:
data.device_brand.unique()

In [None]:
Max_brand = data['device_brand'].value_counts().sort_values(ascending=False)[0:20]
Max_brand.name = 'Count'
Max_brand.index.name = 'Brands'

In [None]:
fig = plt.figure(figsize=(14,8))
sns.barplot(x=Max_brand.index,y= Max_brand)
plt.tight_layout()

In [None]:
fig = plt.figure(figsize=(15,8))
sns.barplot(y=data['Device_Brand'],x=data['device_brand'])
plt.xticks(rotation=90);

In [None]:
data.os.value_counts()

In [None]:
# Pass the column explicitly as `x`: a positional Series is deprecated in
# seaborn 0.11 and is interpreted as `data` (not `x`) from seaborn 0.12 on.
sns.countplot(x=data['os'])

## Android phones are highest in number

In [None]:
data['5g'].value_counts()

In [None]:
# Pass the column explicitly as `x`: a positional Series is deprecated in
# seaborn 0.11 and is interpreted as `data` (not `x`) from seaborn 0.12 on.
sns.countplot(x=data['5g'])

## There are very few phones with 5G

In [None]:
data['4g'].value_counts()

In [None]:
# Pass the column explicitly as `x`: a positional Series is deprecated in
# seaborn 0.11 and is interpreted as `data` (not `x`) from seaborn 0.12 on.
sns.countplot(x=data['4g'])

## More phones are 4G-enabled than not

In [None]:
data['ram'].value_counts()

In [None]:
# Pass the column explicitly as `x`: a positional Series is deprecated in
# seaborn 0.11 and is interpreted as `data` (not `x`) from seaborn 0.12 on.
sns.countplot(x=data['ram'])

## Most phones have 4GB Ram

In [None]:
data['release_year'].value_counts()

In [None]:
# Pass the column explicitly as `x`: a positional Series is deprecated in
# seaborn 0.11 and is interpreted as `data` (not `x`) from seaborn 0.12 on.
sns.countplot(x=data['release_year'])

In [None]:
data['internal_memory'].value_counts()

In [None]:
# Pass the column explicitly as `x`: a positional Series is deprecated in
# seaborn 0.11 and is interpreted as `data` (not `x`) from seaborn 0.12 on.
sns.countplot(x=data['internal_memory'])
plt.tight_layout()

## Relation between target variable and categorical features

In [None]:
fig, ax = plt.subplots(2, 2, figsize=(15, 10))

# One box plot of the target per categorical feature; the flattened grid is
# filled row by row, so the panels appear in the order listed here.
for panel, feature in zip(ax.flatten(), ['os', '4g', '5g', 'release_year']):
    sns.boxplot(x=feature, y='normalized_used_price', data=data, ax=panel)
    panel.set_title(feature + ' vs normalized_used_price')

plt.tight_layout()
plt.show()

## Through these visualizations we can see that : 
## -  phones with os as ios are priced higher than phones with other os
## -  5G enabled phones are priced higher
## -  4G enabled phones are priced higher
## -  Phones from recent years are priced higher than phones from previous years

In [None]:
data.columns

In [None]:
numerical_features = ['screen_size','rear_camera_mp','front_camera_mp','battery','weight','days_used','normalized_new_price', 'normalized_used_price']

# Checking distribution of data

In [None]:
data.hist(figsize=(12,8),bins=30)
plt.tight_layout()

## Function to visualize relationship between target variable and numerical features

In [None]:
def bivariate_analysis(x):
    """Draw a regression scatter plot of the target against one feature.

    Opens a new 10x6 figure and plots column ``x`` of the global ``data``
    frame against ``normalized_used_price`` with seaborn's ``regplot``.

    Parameters
    ----------
    x : str
        Name of the column in ``data`` to put on the horizontal axis.
    """
    target = 'normalized_used_price'
    plt.figure(figsize=(10, 6))
    axes = sns.regplot(data=data, x=x, y=target)
    axes.set_title("Used Price vs " + x, fontsize=25)
    axes.set_ylabel(target, fontsize=20)
    axes.set_xlabel(x, fontsize=20)
    # Limit the x axis to roughly ten tick intervals to keep labels legible.
    plt.locator_params(axis='x', nbins=10)

In [None]:
cols = ['screen_size','rear_camera_mp','front_camera_mp','battery','weight','days_used','ram','internal_memory','normalized_new_price']
for x in cols:
    bivariate_analysis(x)
plt.tight_layout()

# Multivariate Analysis

In [None]:
sns.pairplot(data[numerical_features],diag_kind='kde')

In [None]:
fig = plt.figure(figsize=(12,8))
# 'os' and 'device_brand' are still string columns at this point; restrict the
# correlation matrix to numeric columns so corr() does not raise on pandas >= 2.0
# (older pandas dropped non-numeric columns silently, giving the same result).
sns.heatmap(data.select_dtypes(include='number').corr(), annot=True)

# Relation between Features  
- __Used Price__ has the highest <font color=green>__positive__</font> correlation with __New Price__, followed by __battery__, __front camera__, __screen size__, __rear camera__ and __4G__.
- __Used Price__ has the highest <font color=red>__negative__</font> correlation with __Weight__ followed by __Days Used__. 

### There is a <font color=green>__positive__</font> correlation between  
   -  __Battery__ and __Screen Size__ __(HIGH)__    
   -  __Release Year__ and __Front Camera__ __(MODERATE)__  
   -  __Front Camera__ and __4G__ __(MODERATE)__  
   -  __Front Camera__ and __Rear Camera__ __(MODERATE)__  
   -  __Release Year__ and __4G__ __(MODERATE)__  
   -  __Rear Camera__ and __4G__ __(LOW)__  
   -  __Rear Camera__ and __New Price__ __(LOW)__  
   -  __New Price__ and __Battery__ __(LOW)__
   -  __New Price__ and __Screen Size__ __(LOW)__  
   -  __New Price__ and __Front Camera__ __(LOW)__    

### There is a <font color=red>__negative__</font> correlation between  
   -  __Battery__ and __Weight__ __(HIGH)__  
   -  __Screen Size__ and __Weight__ __(HIGH)__    
   -  __Days Used__ and __Release Year__ __(MODERATE)__    
   -  __Days used__ and __Front Camera__ __(MODERATE)__   

In [None]:
cols2 = ['screen_size','rear_camera_mp','front_camera_mp','battery','weight','days_used','normalized_new_price','normalized_used_price']

# Checking Outliers

In [None]:
# Draw one box plot per column of cols2 on a 2x4 grid (cols2 holds 8 columns).
fig,ax=plt.subplots(2,4,figsize=(12,8))
index=0
# Flatten the 2-D axes array so it can be addressed with a single running index.
ax=ax.flatten()
for col in cols2:
    sns.boxplot(y=col, data=data, color='r', ax=ax[index])
    index+=1
plt.tight_layout(pad=0.5, w_pad=1, h_pad=5.0)

## Function to detect outliers

In [None]:
def detect_outliers(columns, df=None):
    """Collect the row labels of IQR outliers across several columns.

    A value is an outlier when it lies more than 1.5 * IQR below the first
    quartile or above the third quartile of its column.

    Parameters
    ----------
    columns : iterable of str
        Names of the numeric columns to inspect.
    df : pandas.DataFrame, optional
        Frame to inspect. Defaults to the notebook-level ``data`` frame.

    Returns
    -------
    list
        Index labels of the outlying rows, column by column. A row that is an
        outlier in several columns appears once per such column. Empty when
        ``columns`` is empty or nothing is flagged.
    """
    frame = data if df is None else df
    outlier_indices = []

    for column in columns:
        # 1st and 3rd quartiles
        q1, q3 = np.percentile(frame[column], [25, 75])
        # Outlier step: 1.5 times the inter-quartile range
        outlier_step = (q3 - q1) * 1.5
        # Detect outliers and store their indices
        is_outlier = ((frame[column] < q1 - outlier_step)
                      | (frame[column] > q3 + outlier_step))
        outlier_indices.extend(frame[is_outlier].index)

    # Return only after every requested column has been scanned.
    return outlier_indices

In [None]:
len(detect_outliers(cols2))

## 440 Outliers found!

# Checking skewness of data

In [None]:
def check_skweness(columnName):
    """Report and plot how far a column of ``data`` is from a normal shape.

    Prints the mean and standard deviation of a normal distribution fitted to
    the column, then shows a figure with the column's distribution (normal fit
    overlaid) on the left and its probability plot on the right.

    Parameters
    ----------
    columnName : str
        Name of the column in the global ``data`` frame to inspect.
    """
    print('''Before Correcting''')
    try:
        fitted = norm.fit(data[columnName])
    except RuntimeError:
        # Retry the fit with missing values removed.
        fitted = norm.fit(data[columnName].dropna())
    mu, sigma = fitted

    label = columnName.upper()
    print("Mu before correcting {0} : {1}, Sigma before correcting {0} : {2}".format(
        label, mu, sigma))

    plt.figure(figsize=(20, 10))

    # Left panel: distribution with fitted normal curve.
    plt.subplot(1, 2, 1)
    sns.distplot(data[columnName], fit=norm, color="orange")
    plt.title(columnName.upper() + " Distplot before Skewness Correction",
              color="black")

    # Right panel: probability plot against the normal distribution.
    plt.subplot(1, 2, 2)
    stats.probplot(data[columnName], plot=plt)

    plt.show()

In [None]:
skew_check_cols = ['screen_size','rear_camera_mp','front_camera_mp','battery','weight','days_used']
for columns in skew_check_cols:
    check_skweness(columns)

## Function to apply different transformations on variables to get normalized data

In [None]:
def _reciprocal(values):
    """Return 1/values with infinities (from division by zero) replaced by 0."""
    return (1 / values).replace([np.inf, -np.inf], 0)


# Transformation key -> (label printed in the output, function applied to the column).
_TRANSFORMATIONS = {
    # Shift by 1 before Box-Cox so zero values are accepted; keep only the
    # transformed values and discard the fitted lambda.
    'boxcox': ("BoxCox", lambda values: boxcox(values + 1)[0]),
    'log': ("Log", np.log1p),
    'reciprocal': ("Reciprocal", _reciprocal),
    'sqroot': ("Square_Root", lambda values: values ** (1/2)),
}

# Any key not listed above falls back to the 1/1.2 power transform.
_FALLBACK_TRANSFORMATION = ("Exponential", lambda values: values ** (1/1.2))


def trying_different_transformations(column, transformation):
    """Preview the effect of one transformation on a column of ``data``.

    Applies the transformation to a copy of the column (``data`` itself is not
    modified), prints the mean and standard deviation of a normal distribution
    fitted to the result, and draws the transformed distribution next to its
    probability plot.

    Parameters
    ----------
    column : str
        Name of the column in the global ``data`` frame.
    transformation : str
        One of ``'boxcox'``, ``'log'``, ``'reciprocal'`` or ``'sqroot'``;
        any other value selects the 1/1.2 power ("Exponential") transform.

    Notes
    -----
    A ``RuntimeError`` or ``ValueError`` raised while transforming, fitting or
    plotting is reported on stdout and the transformation is skipped, so a
    loop over several transformations keeps going.
    """
    label, transform = _TRANSFORMATIONS.get(transformation, _FALLBACK_TRANSFORMATION)
    try:
        print(label + " - " + column)
        transformed = transform(data[column])
        (mu, sigma) = norm.fit(transformed)
        print("mu ", mu, " sigma ", sigma)
        plt.figure(figsize=(20, 10))
        plt.subplot(1, 2, 1)
        sns.distplot(transformed, fit=norm, color="orange")
        plt.subplot(1, 2, 2)
        stats.probplot(transformed, plot=plt)
    except (RuntimeError, ValueError) as err:
        # Best effort: report the failure instead of hiding it silently.
        print("Skipped {} on {}: {}".format(label, column, err))

## Transformations to try !

In [None]:
transformations = ['boxcox','log','reciprocal','sqroot','exp']

## Trying different transformations on screen_size feature

In [None]:
for x in transformations:
    trying_different_transformations('screen_size',x)

## Trying different transformations on rear_camera_mp feature

In [None]:
for x in transformations:
    trying_different_transformations('rear_camera_mp',x)

## Trying different transformations on front_camera_mp feature

In [None]:
for x in transformations:
    trying_different_transformations('front_camera_mp',x)

## Trying different transformations on battery feature

In [None]:
for x in transformations:
    trying_different_transformations('battery',x)

## Trying different transformations on weight feature

In [None]:
for x in transformations:
    trying_different_transformations('weight',x)

## Trying different transformations on days_used feature

In [None]:
for x in transformations:
    trying_different_transformations('days_used',x)

# Applying transformations

In [None]:
def skweness_correction(columnName):
    """Apply the chosen skewness correction to a column of ``data`` in place.

    The transformation depends on the column:

    * ``front_camera_mp``, ``screen_size``, ``battery``: Box-Cox of value + 1
    * ``weight``: reciprocal, with infinities replaced by 0
    * ``rear_camera_mp``: square root
    * any other column: left unchanged

    Afterwards the mean and standard deviation of a normal fit are printed and
    the resulting distribution is shown next to its probability plot.

    Parameters
    ----------
    columnName : str
        Name of the column in the global ``data`` frame to correct.
    """
    if columnName in ('front_camera_mp', 'screen_size', 'battery'):
        # Shift by 1 so zero values are accepted; the fitted lambda is not needed.
        data[columnName], _ = boxcox(data[columnName] + 1)
    elif columnName == 'weight':
        # Divide first, then clean up: the infinities come from 1/0, so the
        # replacement has to be applied to the result of the division.
        data[columnName] = (1 / data[columnName]).replace([np.inf, -np.inf], 0)
    elif columnName == 'rear_camera_mp':
        data[columnName] = data[columnName] ** (1/2)

    print('''After Correcting''')
    (mu, sigma) = norm.fit(data[columnName])
    print("Mu after correcting {} : {}, Sigma after correcting {} : {}".format(
        columnName.upper(), mu, columnName.upper(), sigma))
    plt.figure(figsize=(20, 10))
    plt.subplot(1, 2, 1)
    sns.distplot(data[columnName], fit=norm, color="orange")
    plt.title(columnName.upper() +
              " Distplot After Skewness Correction", color="black")
    plt.subplot(1, 2, 2)
    stats.probplot(data[columnName], plot=plt)
    plt.show()

In [None]:
skewColumnList1 = ['screen_size','rear_camera_mp','front_camera_mp','battery','weight']
for columns in skewColumnList1:
    skweness_correction(columns)

# Dropping Outliers!

In [None]:
len(detect_outliers(cols2))

In [None]:
data = data.drop(detect_outliers(cols2),axis = 0).reset_index(drop = True)

In [None]:
data.shape[0]

# Encoding categorical features

In [None]:
data = data.drop(['os','device_brand'],axis=1)

In [None]:
data

In [None]:
# One-hot encode the release year (first level dropped as the baseline).
# The prefix gives the new columns string names such as 'release_year_2014';
# without it they would be bare integers, and a frame mixing integer and string
# column names is rejected by scikit-learn's feature-name validation.
dummies_year = pd.get_dummies(data['release_year'], prefix='release_year', drop_first=True)
data = pd.concat([data, dummies_year], axis=1)
data = data.drop('release_year', axis=1)

In [None]:
data

# Train and Test Split

In [None]:
# Separate the target column from the feature matrix.
X = data.drop(columns='normalized_used_price')
Y = data['normalized_used_price']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

In [None]:
X_train.head()

In [None]:
X_train.columns

# Scaling features

In [None]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
X_test.shape[0]

# Linear Regression Model

In [None]:
linear_model = LinearRegression()
linear_model.fit(X_train,y_train)
X_train_Prediction = linear_model.predict(X_train)
print("MSE : ",mean_squared_error(y_train,X_train_Prediction))
print("R2 Score : ",r2_score(y_train,X_train_Prediction))

In [None]:
cross_linear = cross_val_score(linear_model,X_train,y_train,scoring="neg_mean_squared_error",cv=5)
mean_cross_linear = np.mean(cross_linear)
print(mean_cross_linear)

# Ridge Regression Model

In [None]:
ridge_model = Ridge()
ridge_model.fit(X_train,y_train)
X_train_pred_ridge = ridge_model.predict(X_train)
print("MSE : ",mean_squared_error(y_train,X_train_pred_ridge))
print("R2 Score : ",r2_score(y_train,X_train_pred_ridge))

In [None]:
cross_ridge = cross_val_score(ridge_model,X_train,y_train,scoring="neg_mean_squared_error",cv=5)
mean_cross_ridge = np.mean(cross_ridge)
print(mean_cross_ridge)

# Lasso Regression Model

In [None]:
lasso_model = Lasso()
lasso_model.fit(X_train,y_train)
X_train_pred_lasso = lasso_model.predict(X_train)
print("MSE : ",mean_squared_error(y_train,X_train_pred_lasso))
print("R2 Score : ",r2_score(y_train,X_train_pred_lasso))


In [None]:
cross_lasso = cross_val_score(lasso_model,X_train,y_train,scoring="neg_mean_squared_error",cv=5)
mean_cross_lasso = np.mean(cross_lasso)
print(mean_cross_lasso)

# SVM Regressor

In [None]:
svmreg_model = SVR()
svmreg_model.fit(X_train,y_train)
X_train_pred_svmreg = svmreg_model.predict(X_train)
print("MSE : ",mean_squared_error(y_train,X_train_pred_svmreg))
print("R2 Score : ",r2_score(y_train,X_train_pred_svmreg))

In [None]:
cross_svmreg = cross_val_score(svmreg_model,X_train,y_train,scoring="neg_mean_squared_error",cv=5)
mean_cross_svmreg = np.mean(cross_svmreg)
print(mean_cross_svmreg)

# DecisionTree Regressor

In [None]:
dtree_model = DecisionTreeRegressor(max_depth=10)
dtree_model.fit(X_train,y_train)
X_train_pred_dtree = dtree_model.predict(X_train)
print("MSE : ",mean_squared_error(y_train,X_train_pred_dtree))
print("R2 Score : ",r2_score(y_train,X_train_pred_dtree))

In [None]:
cross_dtree = cross_val_score(dtree_model,X_train,y_train,scoring="neg_mean_squared_error",cv=5)
mean_cross_dtree = np.mean(cross_dtree)
print(mean_cross_dtree)

# RandomForest Regressor

In [None]:
rfr_model = RandomForestRegressor()
rfr_model.fit(X_train,y_train)
X_train_pred_rfr = rfr_model.predict(X_train)
print("MSE : ",mean_squared_error(y_train,X_train_pred_rfr))
print("R2 Score : ",r2_score(y_train,X_train_pred_rfr))

In [None]:
cross_rfr = cross_val_score(rfr_model,X_train,y_train,scoring="neg_mean_squared_error",cv=5)
mean_cross_rfr = np.mean(cross_rfr)
print(mean_cross_rfr)

In [None]:
X_test_pred_rfr = rfr_model.predict(X_test)
print("MSE : ",mean_squared_error(y_test,X_test_pred_rfr))
print("R2 Score : ",r2_score(y_test,X_test_pred_rfr))

In [None]:
# Persist the fitted random-forest model; the context manager guarantees the
# file is flushed and closed once the dump finishes.
with open("Price_Predictor.pkl", 'wb') as model_file:
    pk.dump(rfr_model, model_file)

In [None]:
# Persist the fitted scaler alongside the model; the context manager guarantees
# the file is flushed and closed once the dump finishes.
with open("Scaler.pkl", 'wb') as scaler_file:
    pk.dump(sc, scaler_file)