In [12]:
import pandas as pd

# Load the datasets
train_data = pd.read_excel('Task1and2/train.xlsx')
test_data = pd.read_excel('Task1and2/test.xlsx')

# Inspect the datasets
print("Training Data:")
print(train_data.head())
print("\nTest Data:")
print(test_data.head())


Training Data:
   T1  T2  T3  T4  T5  T6  T7  T8  T9  T10  T11  T12  T13  T14  T15  T16  T17  \
0 -70 -61 -66 -53 -51 -63 -82 -57 -76  -78  -66  -66  -61  -59  -73  -75  -63   
1 -77 -74 -71 -76 -65 -63 -66 -52 -55  -75  -72  -75  -74  -61  -64  -63  -53   
2 -53 -38 -55 -66 -62 -62 -65 -70 -62  -52  -56  -53  -66  -68  -72  -60  -68   
3 -72 -62 -59 -65 -65 -65 -78 -82 -83  -59  -84  -60  -64  -83  -69  -72  -95   
4 -67 -69 -65 -63 -59 -53 -70 -72 -71  -60  -61  -57  -54  -76  -61  -66  -71   

   T18 target  
0  -77    B37  
1  -63    B61  
2  -77    A19  
3  -73    A22  
4  -80    A33  

Test Data:
   T1  T2  T3  T4  T5  T6  T7  T8  T9  T10  T11  T12  T13  T14  T15  T16  T17  \
0 -76 -83 -70 -66 -64 -72 -64 -69 -60  -76  -83  -78  -81  -81  -81  -70  -60   
1 -58 -57 -78 -81 -73 -73 -78 -78 -82  -49  -55  -58  -66  -79  -72  -83  -74   
2 -70 -70 -71 -69 -69 -68 -61 -55 -53  -82  -87  -76  -68  -57  -64  -75  -57   
3 -71 -61 -56 -56 -61 -60 -68 -66 -72  -58  -55  -56  -58  -62  -6

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

# Separate features and target variable from training data
X_train = train_data.drop('target', axis=1)
y_train = train_data['target']

# Identify categorical columns
categorical_cols = X_train.select_dtypes(include=['object']).columns

# Apply Label Encoding to categorical columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    label_encoders[col] = le

# Apply the same transformation to test data
for col in categorical_cols:
    test_data[col] = label_encoders[col].transform(test_data[col])

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
test_data = imputer.transform(test_data)

# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
test_data = scaler.transform(test_data)

# Split the data into training and validation sets
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)



In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Logistic Regression
log_reg = LogisticRegression()
log_reg.fit(X_train_split, y_train_split)
y_pred_log_reg = log_reg.predict(X_val_split)
log_reg_accuracy = accuracy_score(y_val_split, y_pred_log_reg)

# Random Forest
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train_split, y_train_split)
y_pred_rf = rf_clf.predict(X_val_split)
rf_accuracy = accuracy_score(y_val_split, y_pred_rf)

print(f"Logistic Regression Accuracy: {log_reg_accuracy}")
print(f"Random Forest Accuracy: {rf_accuracy}")


Logistic Regression Accuracy: 0.9706162426880697
Random Forest Accuracy: 0.9861243368249217


In [19]:
# Predict on the test data
test_predictions = rf_clf.predict(test_data)

# Load the original test data to get the index
original_test_data = pd.read_excel('Task1and2/test.xlsx')

In [21]:
output = pd.DataFrame({'Id': original_test_data.index, 'Predicted': test_predictions})
output.to_csv('Task1and2/test_predictions.csv', index=False)