In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder


In [11]:
# Read the training and test workbooks from the task folder, in that order.
train_data, test_data = (
    pd.read_excel(workbook)
    for workbook in ("./Task1and2/train.xlsx", "./Task1and2/test.xlsx")
)

In [31]:
# Total number of missing cells across all columns of the training frame.
train_data.isna().sum().sum()

0

In [13]:
# Separate the feature columns from the "target" column, then hold out 20%
# of the rows; random_state pins the shuffle so the split is repeatable.
X = train_data.drop("target", axis=1)
y = train_data["target"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


In [14]:
# Shapes of the four pieces produced by the split.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))


((29401, 18), (29401,), (7351, 18), (7351,))

In [15]:
# Standardize the features: the scaler is fitted on the training rows only
# and the same fitted transform is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_norm = scaler.transform(X_train)
X_test_norm = scaler.transform(X_test)

In [16]:
# Fit the encoder on the training labels and convert them to integer codes.
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded = label_encoder.transform(y_train)

In [17]:
# Sanity check: scaled features and encoded labels must have matching row counts.
tuple(arr.shape for arr in (X_train_norm, y_train_encoded))

((29401, 18), (29401,))

In [18]:
# Fit several classifiers on the scaled training features and record, per model:
#   - train_accuracies: accuracy on the data the model was fitted on
#   - test_accuracies:  accuracy on the held-out split (X_test_norm / y_test)
#   - test_predictions: held-out predictions decoded back to the original labels
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=10),
    "Random Forest": RandomForestClassifier(random_state=10),
    "Support Vector Machine": SVC(random_state=10)
}

train_accuracies = {}
test_accuracies = {}
test_predictions = {}

# Positional array of the true held-out labels, used to score each model.
held_out_labels = y_test.to_numpy()

for name, model in models.items():
    model.fit(X_train_norm, y_train_encoded)
    train_accuracy = model.score(X_train_norm, y_train_encoded)
    test_pred = model.predict(X_test_norm)
    decoded_pred = label_encoder.inverse_transform(test_pred)

    train_accuracies[name] = train_accuracy
    test_predictions[name] = decoded_pred
    # Training accuracy alone cannot rank the models, so also score the
    # hold-out. Comparing decoded labels (rather than encoding y_test) avoids
    # an error if the hold-out contains a label the encoder never saw.
    test_accuracies[name] = float((decoded_pred == held_out_labels).mean())

# The displayed tuple is kept as before; inspect `test_accuracies` for the
# held-out scores.
train_accuracies, test_predictions

({'Logistic Regression': 0.9773136968130336,
  'Random Forest': 0.9995238257202136,
  'Support Vector Machine': 0.9898642903302609},
 {'Logistic Regression': array(['B54', 'A69', 'B72', ..., 'B40', 'B14', 'A76'], dtype=object),
  'Random Forest': array(['B54', 'A69', 'B72', ..., 'B40', 'B14', 'A76'], dtype=object),
  'Support Vector Machine': array(['B54', 'A69', 'B72', ..., 'B40', 'B14', 'A76'], dtype=object)})

In [19]:
# Collect each model's held-out predictions side by side.
# Maps the key used in `test_predictions` to the column it fills.
prediction_columns = {
    "Logistic Regression": 'Logistic_pred',
    "Random Forest": 'Random_Forest_pred',
    "Support Vector Machine": 'SVM_pred',
}

# 'actual' is declared up front but is not filled in this cell.
predictions = pd.DataFrame(columns=[*prediction_columns.values(), 'actual'])

for model_name, column in prediction_columns.items():
    predictions[column] = test_predictions[model_name]

In [24]:
# Show the table without the 'actual' column (returns a copy; `predictions` is unchanged).
predictions.drop('actual', axis='columns')

Unnamed: 0,Logistic_pred,Random_Forest_pred,SVM_pred
0,B54,B54,B54
1,A69,A69,A69
2,B72,B72,B72
3,A21,A21,A21
4,B22,B22,B22
...,...,...,...
7346,B80,B80,B80
7347,B71,B71,B71
7348,B40,B40,B40
7349,B14,B14,B14


In [25]:
# Fill 'actual' with the held-out labels. to_numpy() strips y_test's index so
# the values are placed by position instead of being aligned on index labels.
predictions = predictions.assign(actual=y_test.to_numpy())

In [28]:
# Raise pandas' row display limit to 4000 (a global, session-wide setting),
# then show the comparison table.
pd.set_option("display.max_rows", 4000)
predictions

Unnamed: 0,Logistic_pred,Random_Forest_pred,SVM_pred,actual
0,B54,B54,B54,B54
1,A69,A69,A69,A69
2,B72,B72,B72,B72
3,A21,A21,A21,A21
4,B22,B22,B22,B22
...,...,...,...,...
7346,B80,B80,B80,B80
7347,B71,B71,B71,B71
7348,B40,B40,B40,B40
7349,B14,B14,B14,B14


In [33]:
# Export predictions as CSV (predictions.csv), omitting the row index.
predictions.to_csv('predictions.csv', index=False)

In [32]:
import pickle as pkl

# Serialize the whole `models` dict into one binary file.
# NOTE(review): `scaler` and `label_encoder` are not written here. The models
# were fitted on scaled features and encoded labels, so a consumer of
# models.pkl presumably needs those objects too - confirm they are persisted
# elsewhere.
with open("models.pkl", mode="wb") as model_file:
    pkl.dump(models, model_file)