In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)4
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the raw phone-price table from the Kaggle input directory and display it.
csv_path = '/kaggle/input/mobile-phone-price/Mobile phone price.csv'
df = pd.read_csv(csv_path, encoding='utf-8')
df

## Data Preprocessing

In [None]:
# Column dtypes and non-null counts of the freshly loaded frame.
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Number of rows that repeat an earlier row exactly.
df.duplicated().sum()

In [None]:
# Remove duplicate rows in place. The surviving rows keep their original row
# labels (the index is not renumbered). Then show the remaining (rows, columns).
df.drop_duplicates(inplace = True)
df.shape

In [None]:
# Summary statistics of the de-duplicated frame.
df.describe()

In [None]:
# Frequency of each distinct raw screen-size entry.
df['Screen Size (inches)'].value_counts()

In [None]:
# Show the row(s) whose screen size is the non-numeric entry '6.8 + 3.9'.
df[df['Screen Size (inches)'] == '6.8 + 3.9']

In [None]:
# Show the row(s) whose screen size is the non-numeric entry '7.6 (unfolded)'.
df[df['Screen Size (inches)'] == '7.6 (unfolded)']

In [None]:
# Data cleaning: turn the text columns that hold numbers into numeric dtypes.

# Price: drop the currency symbol and thousands separators, then convert.
# regex=False makes the match explicitly literal: read as a regular expression
# '$' is the end-of-string anchor, and the default of `regex` differs between
# pandas versions, so relying on it can leave the dollar sign in place.
df['Price ($)'] = pd.to_numeric(
    df['Price ($)']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.strip()
)

# RAM and storage: strip the 'GB' unit suffix before converting.
# Both column names carry a trailing space in this dataset.
for column in ('RAM ', 'Storage '):
    df[column] = pd.to_numeric(
        df[column].str.replace('GB', '', regex=False).str.strip()
    )

# Screen size: two entries are not plain numbers. They are matched by value
# rather than by hard-coded row label, so the fix still lands on the right rows
# if the row labels change, and cannot append a new row through .loc when a
# label is missing. Only the main/unfolded display size is kept.
screen_fixes = {'6.8 + 3.9': '6.8', '7.6 (unfolded)': '7.6'}
df['Screen Size (inches)'] = pd.to_numeric(
    df['Screen Size (inches)'].replace(screen_fixes)
)
df.info()

In [None]:
# Independent snapshot of the cleaned frame, taken before label encoding and
# outlier removal modify `df`.
data = df.copy()

In [None]:
# Column labels. Note that 'RAM ' and 'Storage ' are referenced with a trailing
# space throughout this notebook.
df.columns

## Label encoding

In [None]:
from sklearn.preprocessing import LabelEncoder
# A single encoder instance; the next cell reuses it for several columns.
le = LabelEncoder()

In [None]:
# Replace each of these columns with integer codes. The same encoder object is
# refit on every column, so afterwards `le` only holds the classes of the last one.
for column in ('Brand', 'Model', 'Camera (MP)'):
    df[column] = le.fit_transform(df[column])

In [None]:
# Summary statistics after cleaning and label encoding.
df.describe()

In [None]:
# Box plot of every column before outlier removal, one panel each on a 3x3 grid.
df.plot.box(subplots=True, figsize=(10, 10), layout=(3, 3))

In [None]:
# Remove rows that are extreme in at least one feature. A row is dropped when
# any of the conditions below holds; the frame is modified in place.
outlier_mask = (
    (df['Storage '] > 256)                      # storage
    | (df['RAM '] > 12)                         # RAM
    | (df['Price ($)'] > 1600)                  # price
    | (df['Battery Capacity (mAh)'] > 6100)     # battery capacity, upper bound
    | (df['Battery Capacity (mAh)'] < 2500)     # battery capacity, lower bound
    | (df['Screen Size (inches)'] > 7)          # screen size
)
outlier_index = df[outlier_mask].index
df.drop(index=outlier_index, inplace=True)

In [None]:
df.shape #(rows, columns) after cleaning, encoding and outlier removal

In [None]:
data.shape #(rows, columns) of the snapshot taken before encoding and outlier removal

In [None]:
# Box plot of every column after outlier removal, one panel each on a 3x3 grid.
df.plot.box(subplots=True, figsize=(10, 10), layout=(3, 3))

## Modeling

In [None]:
from sklearn.model_selection import train_test_split

# Every column except the price is a feature; the price is the regression target.
# A quarter of the rows is held out for testing, with a fixed seed so the split
# is repeatable.
target_column = 'Price ($)'
x = df.drop(columns=[target_column]).values
y = df[target_column].values
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42
)

In [None]:
# (rows, columns) of the frame the train/test split was drawn from.
df.shape

In [None]:
# (samples, features) of the training split.
x_train.shape

In [None]:
# (samples, features) of the held-out test split.
x_test.shape

## Data Scaling

In [None]:
from sklearn.preprocessing import StandardScaler , MinMaxScaler , RobustScaler
import plotly.express as px

# Learn the scaling statistics from the training split only, then apply those
# same statistics to the test split. Re-fitting on the test split would scale
# the two splits differently and leak test-set information into evaluation.
ro_scaler = RobustScaler()
x_train = ro_scaler.fit_transform(x_train)
x_test = ro_scaler.transform(x_test)

## Linear regression

In [None]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the scaled training split, then print the R^2
# score for the training split followed by the test split.
lr = LinearRegression().fit(x_train, y_train)
for features, target in ((x_train, y_train), (x_test, y_test)):
    print(lr.score(features, target))

In [None]:
# Fitted linear-regression coefficients, one per feature column.
lr.coef_

In [None]:
# Fitted intercept of the linear regression.
lr.intercept_

In [None]:
# Label each coefficient with its feature name. The names are derived the same
# way the feature matrix was built (every column except the price), so the
# labels stay correct even if the price is not the last column.
pd.DataFrame(lr.coef_, index=df.columns.drop('Price ($)'), columns=['coeficient'])

In [None]:
# Predict prices for the held-out test features and display them.
y_pred = lr.predict(x_test)
y_pred

In [None]:
# Side-by-side table of actual and predicted test prices; show the first 10 rows.
df2 = pd.DataFrame({"Y_test" : y_test , "Y_predict": y_pred})
df2.head(10)

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
# All four metrics compare the linear-regression predictions with the test targets.
reg_score = r2_score(y_test , y_pred)  # coefficient of determination (R^2)
mse = mean_squared_error(y_test , y_pred)  # mean squared error
mape = mean_absolute_percentage_error(y_test , y_pred)  # mean absolute percentage error
mae = mean_absolute_error(y_test , y_pred)  # mean absolute error

In [None]:
# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1)
# where p is the number of features and n the number of observations that R^2
# was computed on. `reg_score` is the test-set R^2, so n is the test-set size.
p = x_test.shape[1]
n = len(y_test)
adj_R2 = 1 - (1 - reg_score) * (n - 1) / (n - p - 1)
adj_R2

In [None]:
# Print each test-set metric of the linear regression on its own line.
metric_report = (
    ("R-squared: ", reg_score),
    ("Mean Squared Error: ", mse),
    ("Mean absolute percentage error: ", mape),
    ("Mean absolute error: ", mae),
)
for label, value in metric_report:
    print(label, value)

In [None]:
# Line plot of actual vs. predicted price for the first 50 test rows.
plt.figure(figsize=(12, 6))
first_rows = df2.iloc[:50]
plt.plot(first_rows)
plt.legend(["Actual", " Predicted"])

In [None]:
import statsmodels.api as sm
# Fit an OLS model with statsmodels and print its summary table.
# Note: this uses the full, unscaled x and y built before the train/test split,
# not x_train / y_train, so its figures are in-sample for the whole data set.
x2 = sm.add_constant(x)  # add the intercept column
est = sm.OLS(y , x2)
est2 = est.fit()
print(est2.summary())

In [None]:
# Adjusted R-squared reported by the statsmodels OLS fit.
print(est2.rsquared_adj)

## Ridge Regression

In [None]:
from sklearn.linear_model import RidgeCV

# RidgeCV documents `alphas` as an array-like of candidate regularisation
# strengths, so the single value is passed as a one-element list. With only one
# candidate there is nothing to choose between; add more values here to let
# cross-validation pick the strength.
rid = RidgeCV(alphas=[0.5])
rid.fit(x_train, y_train)

In [None]:
# R^2 of the ridge model on the training split.
rid.score(x_train,y_train)

In [None]:
# R^2 of the ridge model on the held-out test split.
rid.score(x_test , y_test)

In [None]:
# Fitted ridge coefficients, one per feature column.
rid.coef_

In [None]:
# Fitted intercept of the ridge model.
rid.intercept_

In [None]:
# Label each ridge coefficient with its feature name. The names are derived the
# same way the feature matrix was built (every column except the price), so the
# labels stay correct even if the price is not the last column.
pd.DataFrame(rid.coef_, index=df.columns.drop('Price ($)'), columns=['coeficient'])

## Lasso Regression

In [None]:
from sklearn.linear_model import LassoLars
# NOTE: this rebinds `rid`, which held the RidgeCV model until now. From here on
# `rid` is the LassoLars model, fitted on the scaled training split.
rid = LassoLars()
rid.fit(x_train,y_train)

In [None]:
# R^2 on the training split for the model currently bound to `rid` (LassoLars in this section).
rid.score(x_train,y_train)

In [None]:
# R^2 on the held-out test split for the model currently bound to `rid` (LassoLars in this section).
rid.score(x_test , y_test)

In [None]:
# Fitted coefficients of the model currently bound to `rid` (LassoLars in this section).
rid.coef_

In [None]:
# Fitted intercept of the model currently bound to `rid` (LassoLars in this section).
rid.intercept_

In [None]:
# Label each coefficient of the model bound to `rid` with its feature name. The
# names are derived the same way the feature matrix was built (every column
# except the price), so the labels stay correct even if the price is not the
# last column.
pd.DataFrame(rid.coef_, index=df.columns.drop('Price ($)'), columns=['coeficient'])

## Random Forest Regression

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Shallow random forest (depth 4, 6 candidate features per split). The forest's
# bootstrap sampling and feature sub-sampling are random, so a fixed
# random_state is needed for the scores below to be reproducible across runs.
# The seed matches the one used for the train/test split.
rf = RandomForestRegressor(max_depth=4, max_features=6, random_state=42)
rf.fit(x_train, y_train)

In [None]:
# R^2 of the random forest on the training split.
rf.score(x_train,y_train)

In [None]:
# R^2 of the random forest on the held-out test split.
rf.score(x_test , y_test)