In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Read the two CSV files that sit next to the notebook into DataFrames.
train_data = pd.read_csv(filepath_or_buffer='train.csv')
test_data = pd.read_csv(filepath_or_buffer='test.csv')

# Leave the training frame as the last expression so the cell renders it.
train_data

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,794,1,0.5,1,0,1,2,0.8,106,6,...,1222,1890,668,13,4,19,1,1,0,0
1996,1965,1,2.6,1,0,0,39,0.2,187,4,...,915,1965,2032,11,10,16,1,1,1,2
1997,1911,0,0.9,1,1,1,36,0.7,108,8,...,868,1632,3057,9,1,5,1,1,0,3
1998,1512,0,0.9,0,4,1,46,0.1,145,5,...,336,670,869,18,10,19,1,1,1,0


In [None]:
# Render the frame read from 'test.csv' as this cell's output so its columns
# can be compared with the training frame's.
test_data

Unnamed: 0,id,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,...,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,1,1043,1,1.8,1,14,0,5,0.1,193,...,16,226,1412,3476,12,7,2,0,1,0
1,2,841,1,0.5,1,4,1,61,0.8,191,...,12,746,857,3895,6,0,7,1,0,0
2,3,1807,1,2.8,0,1,0,27,0.9,186,...,4,1270,1366,2396,17,10,10,0,1,1
3,4,1546,0,0.5,1,18,1,25,0.5,96,...,20,295,1752,3893,10,0,7,1,1,0
4,5,1434,0,1.4,0,11,1,49,0.5,108,...,18,749,810,1773,15,8,7,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,1700,1,1.9,0,0,1,54,0.5,170,...,17,644,913,2121,14,8,15,1,1,0
996,997,609,0,1.8,1,0,0,13,0.9,186,...,2,1152,1632,1933,8,1,19,0,1,1
997,998,1185,0,1.4,0,1,1,8,0.5,80,...,12,477,825,1223,5,0,14,1,0,0
998,999,1533,1,0.5,1,0,0,50,0.4,171,...,12,38,832,2509,15,11,6,0,1,0


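**Check for missing values**: If either of the next two cells prints a count greater than 0, the corresponding column has missing values. In that case we could fill them in with the column mean (once `X_train` and `X_test` have been created further down). The training-set mean is used for both frames so that the test data is imputed with the same values the model was trained on:

```python
X_train = X_train.fillna(X_train.mean())
X_test = X_test.fillna(X_train.mean())
```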



In [None]:
# Report the number of missing entries per training column, followed by the
# shape of the column index (i.e. how many columns there are).
print(train_data.isna().sum(), train_data.columns.shape, sep='\n')


battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64
(21,)


In [None]:
# Report the number of missing entries per test column, followed by the
# shape of the column index (i.e. how many columns there are).
print(test_data.isna().sum(), test_data.columns.shape, sep='\n')

id               0
battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
dtype: int64
(21,)


In [None]:
# Split the training frame into the label and the predictors.
y_train = train_data['price_range']  # dependent variable: the class to predict
X_train = train_data.drop(columns='price_range')  # independent variables: everything else

# Show which feature names remain after removing the label column.
X_train.columns


Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi'],
      dtype='object')

In [None]:
# Keep exactly the training feature columns, in the training order, so the
# model is given the same inputs at prediction time (this leaves out any
# extra test-only columns such as `id`).
X_test = test_data.loc[:, X_train.columns]
X_test

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,1043,1,1.8,1,14,0,5,0.1,193,3,16,226,1412,3476,12,7,2,0,1,0
1,841,1,0.5,1,4,1,61,0.8,191,5,12,746,857,3895,6,0,7,1,0,0
2,1807,1,2.8,0,1,0,27,0.9,186,3,4,1270,1366,2396,17,10,10,0,1,1
3,1546,0,0.5,1,18,1,25,0.5,96,8,20,295,1752,3893,10,0,7,1,1,0
4,1434,0,1.4,0,11,1,49,0.5,108,6,18,749,810,1773,15,8,7,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1700,1,1.9,0,0,1,54,0.5,170,7,17,644,913,2121,14,8,15,1,1,0
996,609,0,1.8,1,0,0,13,0.9,186,4,2,1152,1632,1933,8,1,19,0,1,1
997,1185,0,1.4,0,1,1,8,0.5,80,1,12,477,825,1223,5,0,14,1,0,0
998,1533,1,0.5,1,0,0,50,0.4,171,2,12,38,832,2509,15,11,6,0,1,0


In [None]:
# Creating the classifier as a pipeline: standardise every feature, then fit a
# Logistic Regression model.
#
# Why: when fitted directly on the raw columns the solver stopped at its
# iteration limit (see the convergence warning saved in this cell's previous
# output), so the predictions came from a model that had not converged.
# scikit-learn's own message recommends scaling the data and/or raising
# max_iter. The raw features sit on very different scales (e.g. `ram` is in
# the thousands while `m_dep` is below 1), which slows the optimiser down.
#
# Keeping the scaler inside the pipeline means it is fitted on the training
# data only, and the very same transformation is then applied to the test data.
# `model` still exposes fit() and predict() exactly as before.
model = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=1000),  # extra headroom over the default iteration limit
)

# Training the model with the training data
model.fit(X_train, y_train)

# Making predictions on the test data
predictions = model.predict(X_test)

# Printing the predictions
print(predictions)

[2 3 1 3 2 2 3 1 3 0 3 3 0 0 2 0 1 1 2 1 0 3 1 2 3 0 2 0 3 1 1 0 3 0 0 1 3
 2 3 3 0 2 0 0 0 1 1 3 1 2 3 1 2 0 3 0 3 1 1 3 3 3 1 1 0 2 1 3 1 2 1 2 2 2
 2 0 3 0 2 3 0 3 3 0 3 0 3 1 3 1 1 2 2 0 2 3 0 2 1 3 3 0 0 2 1 1 1 1 3 3 3
 2 1 3 3 2 3 1 3 0 0 3 3 1 1 0 3 3 2 1 0 2 1 1 3 1 1 0 3 2 1 2 1 2 3 3 3 3
 1 3 2 3 1 0 2 2 3 3 3 3 3 2 3 3 3 3 1 0 3 0 0 0 1 1 0 2 1 0 2 2 0 0 1 1 2
 2 1 1 0 0 0 0 0 3 1 0 2 3 3 3 1 3 2 3 2 1 2 0 1 0 2 2 1 2 2 3 1 3 0 3 2 2
 2 1 0 1 0 2 0 1 0 2 1 0 3 0 3 0 3 1 1 0 0 3 0 3 2 3 1 1 3 0 0 2 2 3 1 3 1
 1 3 3 2 3 3 3 3 2 0 2 1 2 2 1 3 2 0 3 0 2 1 0 0 3 2 2 3 2 0 3 3 1 2 1 2 1
 1 0 2 3 1 0 0 3 0 2 0 1 1 0 1 3 2 3 2 2 0 2 0 0 0 1 3 3 0 0 0 3 3 1 2 3 1
 2 3 3 3 2 3 3 3 2 3 2 3 0 2 0 2 1 2 1 2 2 1 1 1 3 1 3 1 3 0 0 0 0 3 0 1 2
 1 1 1 3 2 0 2 1 0 3 2 0 3 2 1 2 2 2 3 1 1 3 2 1 2 0 0 1 0 3 3 0 0 2 0 0 1
 1 0 1 0 3 3 3 3 3 0 2 1 3 1 1 1 1 3 0 3 3 3 3 1 2 1 2 2 3 2 1 3 3 3 1 0 0
 0 1 3 1 1 3 3 0 2 3 0 0 3 3 1 0 2 2 3 0 2 0 1 3 2 3 1 2 0 3 2 0 3 1 0 0 3
 1 3 2 3 3 2 3 1 1 2 3 3 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Assemble the submission table: the test-set `id` column followed by the
# predicted `price_range` for the same row.
submission_df = test_data[['id']].assign(price_range=predictions)
submission_df

Unnamed: 0,id,price_range
0,1,2
1,2,3
2,3,1
3,4,3
4,5,2
...,...,...
995,996,1
996,997,1
997,998,2
998,999,1


In [None]:
# Write the submission table to disk; index=False keeps the DataFrame's row
# index out of the file so it contains only the `id` and `price_range` columns.
submission_df.to_csv(path_or_buf='submission.csv', index=False)
