# Mobile Price Classification



### Import data 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score,classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB


In [None]:
# Load the training set; the relative path is resolved against the current working directory.
train_data = pd.read_csv('train.csv')
# Preview the first rows (a cell's last expression is rendered by the notebook).
train_data.head()

In [None]:
# Print the column dtypes and non-null counts of the training frame.
train_data.info()

In [None]:
# Count the missing values in each column.
train_data.isna().sum()

In [None]:
# Draw the boolean missing-value mask as a heatmap (one cell per row/column pair).
sns.heatmap(train_data.isna())

In [None]:
# Number of distinct values per column.
train_data.nunique()

In [None]:
# Descriptive statistics for the training frame (pandas `describe` defaults).
train_data.describe()

### EDA

In [None]:
# Flag columns whose values are relabelled below (presumably 0/1-encoded in the raw file).
features_to_replace = ['blue', 'dual_sim', 'four_g', 'three_g', 'touch_screen', 'wifi']
mapping = {0: 'No', 1: 'Yes'}

# Relabel 0 -> 'No' and 1 -> 'Yes'; any value that is not a key of `mapping`
# is left unchanged, so re-running the cell is harmless.
# DataFrame.replace is used because DataFrame.applymap is deprecated
# (pandas >= 2.1) and the element-wise lambda is not needed for a plain lookup.
train_data[features_to_replace] = train_data[features_to_replace].replace(mapping)
train_data.head()

In [None]:
# Work on an independent deep copy for EDA so train_data itself is not modified by the plots' inputs.
df = train_data.copy(deep=True)
df.head()

In [None]:
# Column index of the EDA copy (this name is reassigned to a plain list in later cells).
columns = df.columns

def univ(data, feature, size=20):
    """Plot the univariate distribution of a single column.

    A column with more than ``size`` distinct values is drawn as a density
    histogram (with KDE overlay) next to a boxplot; any other column is drawn
    as one count plot. The figure is shown before returning.

    Args:
        data: DataFrame holding the column.
        feature: Name of the column to plot.
        size: Distinct-value threshold separating the two plot styles.
    """
    plt.figure(figsize=(10, 4))

    many_distinct_values = data[feature].nunique() > size
    if not many_distinct_values:
        # Few distinct values: bar per value.
        sns.countplot(data=data, x=feature)
        plt.title(f'Countplot - {feature}')
    else:
        # Left panel: density histogram with KDE.
        plt.subplot(1, 2, 1)
        sns.histplot(data=data, x=feature, kde=True, stat='density', fill=True)
        plt.title(f'Histogram - {feature}')

        # Right panel: boxplot of the same column.
        plt.subplot(1, 2, 2)
        sns.boxplot(data=data, y=feature)
        plt.title(f'Boxplot - {feature}')

    plt.show()




In [None]:
def biv(data, feature, target='price_range', size=20):
    """Plot one feature against the target column.

    A feature with more than ``size`` distinct values is drawn as a boxplot
    per target class; any other feature is drawn as a count plot with one hue
    per target class. Nothing is drawn when ``feature`` is the target itself.

    Args:
        data: DataFrame holding both columns.
        feature: Name of the column to compare with the target.
        target: Name of the target column.
        size: Distinct-value threshold separating the two plot styles.

    Returns:
        None.
    """
    # Comparing the target with itself is meaningless. Return before a figure
    # is created so that no empty canvas is left behind.
    if feature == target:
        return

    plt.figure(figsize=(10, 4))

    if data[feature].nunique() > size:
        sns.boxplot(data=data, x=target, y=feature, palette='Set2')
        plt.title(f'Boxplot - {feature} by {target}')
    else:
        sns.countplot(data=data, x=feature, hue=target, palette='Set2')
        plt.title(f'Countplot - {target} by {feature}')

    plt.show()


In [None]:
def multiv(data, vars, target='price_range'):
    """Draw a seaborn pairplot of ``vars`` coloured by ``target`` and show it.

    Args:
        data: DataFrame holding the columns.
        vars: List of column names to plot pairwise.
        target: Column used for the hue; it is also named in the title.
    """
    # No plt.figure() call here: sns.pairplot creates its own figure, so an
    # extra one would only be rendered as a blank canvas.
    sns.pairplot(data=data, vars=vars, hue=target, palette='Set2')
    # The pairplot figure is the current figure, so the title lands on it.
    plt.suptitle(f'Features:{vars} by {target}', y=1.02)
    plt.show()

In [None]:
# Bivariate plot of every column against the target. The original iterated an
# undefined name (`features`); the columns of `df` are used instead, leaving
# out the target so it is not plotted against itself.
for feature in [name for name in df.columns if name != 'price_range']:
    biv(df, feature)

In [None]:

# Pairplot of the two camera columns and the two pixel-resolution columns.
# Note: this rebinds `columns`, which an earlier cell set to df.columns.
columns = ['fc', 'pc', 'px_height', 'px_width']
multiv(df,columns)



In [None]:
# Pairplot of pixel resolution, screen size and weight columns, coloured by the target.
cols = ['px_width', 'px_height', 'sc_w', 'sc_h', 'mobile_wt']
multiv(df,cols)


In [None]:

# Pairplot of battery, RAM, internal memory and weight columns, coloured by the target.
cols = ['battery_power', 'ram', 'int_memory', 'mobile_wt']
multiv(df, cols)

In [None]:
import os

def multiv(data, vars, target='price_range', save_folder='saved_images'):
    """Draw a pairplot of ``vars`` coloured by ``target``, save it and show it.

    The image is written to
    ``{save_folder}/pairplot_{comma-joined vars}.png``; the folder is created
    when it does not exist.

    Args:
        data: DataFrame holding the columns.
        vars: List of column names to plot pairwise.
        target: Column used for the hue and named in the title.
        save_folder: Directory that receives the PNG file.
    """
    os.makedirs(save_folder, exist_ok=True)

    # No plt.figure() call here: sns.pairplot creates its own figure, so an
    # extra one would only be rendered as a blank canvas.
    sns.pairplot(data=data, vars=vars, hue=target, palette='Set2')
    plt.suptitle(f'Features: {vars} by {target}', y=1.02)
    # The title sits above the figure area (y=1.02); bbox_inches='tight'
    # keeps it inside the saved image instead of cropping it.
    plt.savefig(f"{save_folder}/pairplot_{','.join(vars)}.png", bbox_inches='tight')
    plt.show()

# Three groups of columns, each rendered (and saved) as its own pairplot.
columns = ['fc', 'pc', 'px_height', 'px_width']
cols1 = ['px_width', 'px_height', 'sc_w', 'sc_h', 'mobile_wt']
cols2 = ['battery_power', 'ram', 'int_memory', 'mobile_wt']

for feature_group in (columns, cols1, cols2):
    multiv(df, feature_group)

In [None]:

# Class balance of the target: one bar per price_range value
target_axes = sns.countplot(data=train_data, x='price_range')
target_axes.set_title('Distribution of Price Range')
plt.show()

In [None]:
# Number of rows in each price_range class.
train_data['price_range'].value_counts()

The output above shows the number of samples in each category of the target variable.

In [None]:
# Column groups used by the plotting cells below.
# Numerical columns (14 in total), listed in the order they are plotted.
numerical_features = [
    'battery_power', 'clock_speed',
    'fc', 'int_memory', 'm_dep', 'mobile_wt',
    'n_cores', 'pc',
    'px_height', 'px_width',
    'ram',
    'sc_h', 'sc_w',
    'talk_time',
]
# Flag columns treated as categorical (6 in total).
categorical_features = [
    'blue',
    'dual_sim',
    'four_g',
    'three_g',
    'touch_screen',
    'wifi',
]


In [None]:
# Histogram with KDE overlay for each numerical feature, on a 5x3 grid of subplots
plt.figure(figsize=(15, 20))
for slot, col_name in enumerate(numerical_features, start=1):
    plt.subplot(5, 3, slot)
    sns.histplot(data=train_data, x=col_name, kde=True)
    plt.title(f'Distribution of {col_name}')
plt.tight_layout()
plt.show()

In [None]:
# Count plot for each categorical feature, on a 2x3 grid of subplots
plt.figure(figsize=(15, 6))
for slot, col_name in enumerate(categorical_features, start=1):
    plt.subplot(2, 3, slot)
    sns.countplot(data=train_data, x=col_name)
    plt.title(f'Distribution of {col_name}')
plt.tight_layout()
plt.show()

In [None]:
# Boxplot of each numerical feature per price_range class, on a 5x3 grid of subplots
plt.figure(figsize=(15, 20))
for slot, col_name in enumerate(numerical_features, start=1):
    plt.subplot(5, 3, slot)
    sns.boxplot(data=train_data, x='price_range', y=col_name)
    plt.title(f'Relationship between {col_name} and Price Range')
plt.tight_layout()
plt.show()

In [None]:
import os

# Create a directory to save the images
os.makedirs("saved_images", exist_ok=True)

# Individual images: one standalone boxplot file per numerical feature.
# Calling savefig inside the grid loop would write the whole, partially
# filled grid under each feature's name, so each feature gets its own figure.
for feature in numerical_features:
    single_fig, single_ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x='price_range', y=feature, data=train_data, ax=single_ax)
    single_ax.set_title(f'Relationship between {feature} and Price Range')
    single_fig.tight_layout()
    single_fig.savefig(f"saved_images/{feature}_vs_price_range.png")
    # Close the figure so it is neither kept in memory nor rendered again.
    plt.close(single_fig)

# Combined image: all boxplots on one 5x3 grid, saved once and shown.
plt.figure(figsize=(15, 20))
for i, feature in enumerate(numerical_features, 1):
    plt.subplot(5, 3, i)
    sns.boxplot(x='price_range', y=feature, data=train_data)
    plt.title(f'Relationship between {feature} and Price Range')
plt.tight_layout()
plt.savefig("saved_images/all_relationships.png")  # Save one image with all diagrams
plt.show()

In [None]:
# Count plot of each categorical feature split by price_range class, on a 2x3 grid of subplots
plt.figure(figsize=(15, 6))
for slot, col_name in enumerate(categorical_features, start=1):
    plt.subplot(2, 3, slot)
    sns.countplot(data=train_data, x=col_name, hue='price_range')
    plt.title(f'Relationship between {col_name} and Price Range')
plt.tight_layout()
plt.show()

#### Observations
- A higher battery power tends to be associated with a more advanced class of mobile device.
- Although mobile weight does have an impact, its influence on the device class is relatively smaller compared to other factors.
- Pixel width (px_width) is a significant factor in determining the class of mobile devices.
- RAM (ram) is a critical determinant in the classification of mobile devices.
- The majority of mobile devices support 3G connectivity.
- Pixel height and width (px_height and px_width) are correlated with the class of mobile devices.
- Screen height and width (sc_h and sc_w) also show a correlation with the class of mobile devices.

In [None]:
# Pairwise correlations of the numeric columns only (the relabelled
# 'Yes'/'No' columns are excluded by the dtype filter).
numeric_columns_only = df.select_dtypes(include='number')
corr_matrix = numeric_columns_only.corr()

# Annotated heatmap of the correlation matrix.
plt.figure(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap='viridis',
    linewidths=1,
)
plt.title('Modified Correlation Matrix')
plt.show()

- RAM (ram) is correlated with price_range.

- There exists a correlation between the resolutions of the primary camera (pc) and the front camera (fc).


### Feature engineering

In [None]:
# Separate the predictors from the target on an independent copy of the data.
df_preprocessing = train_data.copy(deep=True)
y = df_preprocessing['price_range']
X = df_preprocessing.drop(columns=['price_range'])

# Hold out 20% of the rows for validation; the seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of every split.
for split_name, split_part in (
    ("X_train", X_train),
    ("X_val", X_val),
    ("y_train", y_train),
    ("y_val", y_val),
):
    print(f"{split_name} shape:", split_part.shape)

In [None]:
# Route columns by dtype: numeric ones to the scaling branch, object
# (string) ones to the one-hot branch.
num_features = X_train.select_dtypes(include='number').columns
cat_features = X_train.select_dtypes(include='object').columns

# Numeric branch: median imputation, then scaling with MinMaxScaler.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

# Categorical branch: most-frequent imputation, then one-hot encoding that
# drops the first level and ignores categories unseen during fit.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', drop='first')),
]
cat_pipeline = Pipeline(steps=categorical_steps)

# Combine both branches; columns matched by neither are dropped.
branches = [
    ('num_pipeline', num_pipeline, num_features),
    ('cat_pipeline', cat_pipeline, cat_features),
]
preprocessor = ColumnTransformer(transformers=branches, remainder='drop')

In [None]:
# Fit the preprocessor on the training split only, then reuse the fitted
# transformer on the validation split.
X_train_transformed_arr = preprocessor.fit_transform(X_train)
X_val_transformed_arr = preprocessor.transform(X_val)

# Targets as plain NumPy arrays.
y_train_arr, y_val_arr = np.array(y_train), np.array(y_val)

### Modeling

In [None]:
# Candidate classifiers, keyed by the display name used in the printed report.
# The tree-based models are randomised, so they are seeded with the same value
# as the train/validation split (random_state=42) to make the comparison and
# the selected model reproducible between runs.
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Trees': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Support Vector Machines (SVM)': SVC(),
    'Naive Bayes': GaussianNB(),
}

In [None]:
from sklearn.model_selection import cross_val_score
# Model selection: keep the candidate with the highest mean 5-fold
# cross-validated micro-F1 on the training split.
best_model_score = -1
best_model_name = None
best_model = None

for name, model in models.items():
    # Fit the estimator itself on the full training split so that the winner
    # can be used for prediction afterwards (cross_val_score works on clones
    # and leaves `model` untouched).
    model.fit(X_train_transformed_arr, y_train_arr)
    cv_scores = cross_val_score(model, X_train_transformed_arr, y_train_arr, cv=5, scoring='f1_micro')
    avg_cv_score = np.mean(cv_scores)

    print(f"Cross-validation scores for {name}: {cv_scores}")
    print(f"Average CV score for {name}: {avg_cv_score}")
    if avg_cv_score > best_model_score:
        best_model_score = avg_cv_score
        best_model_name = name
        best_model = model

# Only the selected model is evaluated on the held-out validation split; the
# per-model validation prediction previously made inside the loop was never
# used and has been removed.
y_pred = best_model.predict(X_val_transformed_arr)
report = classification_report(y_val_arr, y_pred)

print(f"Best Model: {best_model_name}")
print(report)

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the selected model on the validation split (true labels first, predictions second).
print(confusion_matrix(y_val_arr, y_pred))

In [None]:
# Same confusion matrix as an annotated heatmap with integer counts and no colour bar.
sns.heatmap(confusion_matrix(y_val_arr, y_pred), annot=True, fmt='d', cmap='Blues', cbar=False)


In [None]:
# Load the test set; the relative path is resolved against the current working directory.
test_data = pd.read_csv("test.csv")
test_data.head()

In [None]:
# Remove the 'id' column before the test data goes through the preprocessor.
test_data = test_data.drop(columns=['id'])

In [None]:
# Name of the target column; this variable is not referenced by any other visible cell.
label = ['price_range']

In [None]:
# Apply to the test set the same 'No'/'Yes' relabelling of the flag columns
# that was applied to the training data, so the fitted encoder sees the same
# category labels.
features_to_replace = ['blue', 'dual_sim', 'four_g', 'three_g', 'touch_screen', 'wifi']
mapping = {0: 'No', 1: 'Yes'}

# Values that are not keys of `mapping` are left unchanged.
# DataFrame.replace is used because DataFrame.applymap is deprecated
# (pandas >= 2.1) and the element-wise lambda is not needed for a plain lookup.
test_data[features_to_replace] = test_data[features_to_replace].replace(mapping)
test_data.head()

In [None]:

# Apply the already-fitted preprocessor to the test data (transform only, no refit).
test_transformed = preprocessor.transform(test_data)


In [None]:
# Re-read the raw test file so the original columns (including 'id', which was
# dropped from test_data) are available for the output table.
test = pd.read_csv("test.csv")


In [None]:
# Predict the price range of every test row with the selected model.
result = pd.DataFrame(best_model.predict(test_transformed))

# Attach the predictions to the raw test rows by position: both frames carry
# the default integer index, and the single prediction column is named 0.
final = test.merge(result, left_index=True, right_index=True)
final = final.rename(columns={0: 'price_range'})

# index=False keeps the positional index out of the file; otherwise it is
# written as an extra unnamed first column and shows up as 'Unnamed: 0' when
# result.csv is read back.
final.to_csv('result.csv', index=False)

In [None]:
# Read the written file back and preview it as a check on what was saved.
final_predictions = pd.read_csv("result.csv")
final_predictions.head()