###  1-Load and visualise the Dataset

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the datasets
train_df = pd.read_excel('train.xlsx',)
test_df = pd.read_excel('test.xlsx')

# Display the first few rows of the dataset
train_df.head()

# Display summary statistics
train_df.describe()

# Plot the distribution of the target variable
sns.countplot(x='price_range', data=train_df)
plt.title('Distribution of Price Range')
plt.xlabel('Price Range')
plt.ylabel('Count')
plt.show()


# Plot RAM vs. Price Range
plt.figure(figsize=(10, 6))
sns.boxplot(x='price_range', y='ram', data=train_df)
plt.title('RAM vs. Price Range')
plt.xlabel('Price Range')
plt.ylabel('RAM (MB)')
plt.show()

### 2-Prepare The Dataset

In [None]:
# Check for missing values
print(train_df.isnull().sum())

train_df = train_df.dropna()

# Feature Engineering
# Convert categorical variables to numerical
train_df['blue'] = train_df['blue'].astype(int)
train_df['dual_sim'] = train_df['dual_sim'].astype(int)
train_df['four_g'] = train_df['four_g'].astype(int)
train_df['three_g'] = train_df['three_g'].astype(int)
train_df['touch_screen'] = train_df['touch_screen'].astype(int)
train_df['wifi'] = train_df['wifi'].astype(int)

# Split the data
X = train_df.drop(['price_range'], axis=1)
y = train_df['price_range']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)


### 3-Train The Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

# Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_val)
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))


### 4-Optimize The Model

In [18]:
from sklearn.model_selection import GridSearchCV
import joblib

# Hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Best parameters and model evaluation
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_val)
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

# Save the model
joblib.dump(best_model, 'device_price_model.pkl')
joblib.dump(scaler, 'scaler.pkl')


Fitting 3 folds for each of 108 candidates, totalling 324 fits
[[96 10  0  0]
 [ 7 77  5  0]
 [ 0 14 87  6]
 [ 0  0  3 94]]
              precision    recall  f1-score   support

           0       0.93      0.91      0.92       106
           1       0.76      0.87      0.81        89
           2       0.92      0.81      0.86       107
           3       0.94      0.97      0.95        97

    accuracy                           0.89       399
   macro avg       0.89      0.89      0.89       399
weighted avg       0.89      0.89      0.89       399



['scaler.pkl']