## Mobile Price Classification

## Importing toolkit

In [None]:
import pickle
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import ConfusionMatrixDisplay , classification_report
from sklearn.model_selection import train_test_split

## Data preparation

In [None]:
# Load the training set from the working directory; transposing puts one
# column per phone so that every feature is visible as a row in the preview.
df_train = pd.read_csv(filepath_or_buffer="train.csv")
df_train.transpose()

## Data Exploration

In [None]:
# Print the column names, non-null counts and dtypes of the training frame.
df_train.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df_train.describe()

In [None]:
# Number of distinct values in every column.
df_train.nunique()

## We want to calculate the relative frequencies of the classes

In [None]:
# Relative frequency of each price class, drawn as bars to check class balance.
class_shares = df_train['price_range'].value_counts(normalize=True)
balance_ax = class_shares.plot.bar()
balance_ax.set_xlabel("Price Range classes")
balance_ax.set_ylabel("Frequency")
balance_ax.set_title("Classes balance");

## Now we show how the "Ram" and "Battery Power" columns are distributed within each class of the "Price Range" column

In [None]:
# One box per price class showing how RAM is distributed inside that class.
ram_ax = sns.boxplot(data=df_train, x='price_range', y='ram')
ram_ax.set_xlabel("Price Range classes")
ram_ax.set_ylabel("Ram")
ram_ax.set_title("Distribution of Ram Ratio, by Class");

In [None]:
# One box per price class showing how battery power is distributed inside that class.
battery_ax = sns.boxplot(data=df_train, x='price_range', y='battery_power')
battery_ax.set_xlabel("Price Range classes")
battery_ax.set_ylabel("Battery Power")
battery_ax.set_title("Distribution of Battery Power Ratio, by Class");

## 3G Supported phones

In [None]:
# Pie chart of the share of phones with and without 3G.
#
# value_counts() orders its result by frequency, so pairing it directly with a
# hard-coded label list silently swaps the labels whenever unsupported phones
# are the majority. Reindexing to a fixed [1, 0] order ties each wedge to its
# label regardless of which group is larger.
# NOTE(review): assumes three_g is coded 1 = supported, 0 = not supported —
# confirm against the dataset description.
labels = ["3G-supported",'Not supported']
values = df_train['three_g'].value_counts().reindex([1, 0], fill_value=0).values
fig, ax = plt.subplots()
colors = ['orange', 'lightskyblue']
ax.pie(values, labels=labels, autopct='%1.1f%%',shadow=True,startangle=90,colors=colors)
plt.show();

## 4G supported phones  

In [None]:
# Pie chart of the share of phones with and without 4G.
#
# value_counts() orders its result by frequency, so pairing it directly with a
# hard-coded label list silently swaps the labels whenever unsupported phones
# are the majority. Reindexing to a fixed [1, 0] order ties each wedge to its
# label regardless of which group is larger.
# NOTE(review): assumes four_g is coded 1 = supported, 0 = not supported —
# confirm against the dataset description.
labels = ["4G-supported",'Not supported']
values = df_train['four_g'].value_counts().reindex([1, 0], fill_value=0).values
fig1, ax1 = plt.subplots()
colors = ['orange', 'lightskyblue']
ax1.pie(values, labels=labels, autopct='%1.1f%%',shadow=True,startangle=90,colors=colors)
plt.show();

# Multicollinearity

In [None]:
# Pairwise correlation between the features only (the target is excluded),
# rendered as a heatmap to spot strongly related feature pairs.
feature_frame = df_train.drop(columns='price_range')
corr = feature_frame.corr()
sns.heatmap(data=corr);

## Split

In [None]:
# Separate the label column from the feature matrix.
target = 'price_range'
y = df_train[target]
X = df_train.drop(labels=target, axis=1)
# Quick sanity check that both parts have the same number of rows.
print(f"X Shape {X.shape}")
print(f"y Shape {y.shape}")

### Scale and Normalize Data

In [None]:
scalar = MinMaxScaler()

In [None]:
features = X.columns
X = scalar.fit_transform(X)

In [None]:
print(X)

In [None]:
X_train , X_test , y_train , y_test = train_test_split(X, y, test_size=0.2 , random_state=42)

In [None]:
# Report the dimensions of every part of the train/test split.
for caption, part in (
    ("X_train shape:", X_train),
    ("y_train shape:", y_train),
    ("X_test shape:", X_test),
    ("y_test shape:", y_test),
):
    print(caption, part.shape)

## Iterate

## 1-Logistic Regression

In [None]:
# Logistic regression classifier; the solver may run up to 1000 iterations.
model_lr = LogisticRegression(max_iter=1000)

### Training model

In [None]:
# Fit the logistic regression on the training split.
model_lr.fit(X_train,y_train)

In [None]:
# Mean accuracy of the logistic regression on the rows it was trained on.
training_acc_lr = model_lr.score(X=X_train, y=y_train)
print(f"Training accuracy: {training_acc_lr}")

## Evaluate Logistic Regression model

In [None]:
# Mean accuracy of the logistic regression on the held-out test rows.
testing_acc_lr = model_lr.score(X=X_test, y=y_test)
print(f"Testing accuracy: {testing_acc_lr}")

In [None]:
# Confusion matrix of the logistic regression's predictions on the test split.
ConfusionMatrixDisplay.from_estimator(estimator=model_lr, X=X_test, y=y_test);

## 2-SVM

In [None]:
# Support vector classifier with scikit-learn's default settings.
model_svc= SVC()

### Training model

In [None]:
# Fit the SVM on the training split.
model_svc.fit(X_train, y_train)

In [None]:
# Mean accuracy of the SVM on the rows it was trained on.
training_acc_svc = model_svc.score(X_train , y_train)
# Label corrected: this value is the *training* accuracy; it was previously
# printed as "Testing accuracy", making it indistinguishable from the real
# test score printed by the evaluation cell.
print(f"Training accuracy: {training_acc_svc}")

## Evaluate SVM model

In [None]:
# Mean accuracy of the SVM on the held-out test rows.
testing_acc_svc = model_svc.score(X=X_test, y=y_test)
print(f"Testing accuracy: {testing_acc_svc}")

In [None]:
# Confusion matrix of the SVM's predictions on the test split.
ConfusionMatrixDisplay.from_estimator(estimator=model_svc, X=X_test, y=y_test);

In [None]:
# Side-by-side table of the test accuracy of both classifiers.
models = pd.DataFrame({
    # Spelling corrected from "Logestic": this label is shown in the table
    # and on the comparison bar chart.
    "Models": ["Logistic Regression", "SVM"],
    "Score": [testing_acc_lr, testing_acc_svc],
})
# Show the best model first. The sorted frame is only displayed (it is the
# last expression of the cell); `models` itself keeps its original order.
models.sort_values(by="Score" , ascending=False)

## Plot a bar chart to compare the models' performance

In [None]:
# Bar chart comparing the test accuracy of the two models, one colour per model.
sns.set_style('whitegrid')
colors = ['orange', 'blue']
plt.figure(figsize=(10, 5))
sns.barplot(data=models, x='Models', y='Score', palette=colors)
plt.xlabel("Models")
plt.ylabel("Score")
plt.title("Model Selection")
plt.show();

### Logistic Regression got the highest accuracy

## Communication

In [None]:
# Take the first row of the fitted coefficient matrix.
# NOTE(review): coef_ has one row per class when the target has more than two
# classes, so row 0 would describe only the first class — confirm this is the
# intended view before interpreting the chart as overall feature importance.
importances = model_lr.coef_[0]

# Exponentiate the coefficients into odds ratios, label them with the feature
# names, and order them from smallest to largest.
odds_ratios = pd.Series(data=np.exp(importances), index=features)
odds_ratios = odds_ratios.sort_values()
odds_ratios.head()

## Horizontal bar chart, five largest odds ratios

In [None]:
# The series is sorted ascending, so the last five entries are the largest.
largest_odds = odds_ratios.tail()
largest_odds.plot.barh()
plt.xlabel("Odds Ratio");

## Horizontal bar chart, five smallest odds ratios

In [None]:
# The series is sorted ascending, so the first five entries are the smallest.
smallest_odds = odds_ratios.head()
smallest_odds.plot.barh()
plt.xlabel("Odds Ratio");

## Thanks!