In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score



In [2]:
# PART 1
# Loading the dataset
df_original = pd.read_csv("31-40.csv") 

In [3]:
# Converting 'cit_2017' to integer type
df_original["cit_2017"] = df_original["cit_2017"].astype(int)

In [4]:
# Encoding the 'Label' based on the ratio of 'cit_2022' to 'cit_2021'
# Low - 0, Medium - 1, High - 2
label_values = []
for i in range(len(df_original)):
    value_ratio  = df_original['cit_2022'][i]/df_original['cit_2021'][i]
    if value_ratio  < 1.05:
        label_values.append(0)
    elif value_ratio  > 1.06 and value_ratio  < 1.15:
        label_values.append(1)
    else:
        label_values.append(2)

df_original['Label'] = label_values



In [5]:
# Extracting relevant features and labels
data = df_original[['cit_2017', 'cit_2018', 'cit_2019', 'cit_2020', 'cit_2021','cit_2022']].copy()
label = df_original['Label'].copy()

In [6]:
# Splitting the dataset into training and testing sets (80-20)
X_train, X_test, Y_train, Y_test = train_test_split(data, label, test_size=0.2, random_state=9)

# Scaling the features using Min-Max Scaler
scaler_x = MinMaxScaler()
X_train_scaled = scaler_x.fit_transform(X_train)
X_test_scaled = scaler_x.transform(X_test)  

In [7]:
# Initializing and training the Random Forest Model
rf_model = RandomForestClassifier()
rf_model.fit(X_train_scaled, Y_train)


RandomForestClassifier()

In [8]:
# Making predictions on the test set
Y_pred = rf_model.predict(X_test_scaled)


In [9]:
# Evaluating the model's accuracy
accuracy = accuracy_score(Y_test, Y_pred)

print(f"Model Accuracy with 06 features: {accuracy}")

Model Accuracy with 06 features: 0.65


In [None]:
# PART 2

In [15]:

# Creating a copy of the dataset for additional feature engineering
big_data = data.copy()

# Calculating and adding new features for each year
for year in range(2017, 2022):
    big_data[f'new_cit_{year}'] = ((df_original[f'cit_{year+1}'] - df_original[f'cit_{year}']) / df_original[f'cit_{year}']).round(3)

# Handling infinite values and replacing NaNs with the median

big_data.replace([np.inf, -np.inf], np.nan, inplace=True)   # Replacing infinity values with 'nan'
big_data.fillna(big_data.median(), inplace=True)            # Replacing 'nan' with median values
big_data

Unnamed: 0,cit_2017,cit_2018,cit_2019,cit_2020,cit_2021,cit_2022,new_cit_2017,new_cit_2018,new_cit_2019,new_cit_2020,new_cit_2021
0,7,22,51,70,108,137,2.143,1.318,0.373,0.543,0.269
1,3,7,15,97,119,133,1.333,1.143,5.467,0.227,0.118
2,939,888,794,843,840,755,-0.054,-0.106,0.062,-0.004,-0.101
3,46,76,75,67,59,58,0.652,-0.013,-0.107,-0.119,-0.017
4,129,151,108,95,85,85,0.171,-0.285,-0.120,-0.105,0.000
...,...,...,...,...,...,...,...,...,...,...,...
95,48,54,40,45,43,40,0.125,-0.259,0.125,-0.044,-0.070
96,532,535,497,547,504,505,0.006,-0.071,0.101,-0.079,0.002
97,36,52,57,82,152,222,0.444,0.096,0.439,0.854,0.461
98,242,299,346,357,423,496,0.236,0.157,0.032,0.185,0.173


In [11]:

# Splitting the dataset with new features
X_train_big, X_test_big, Y_train_big, Y_test_big = train_test_split(big_data, label, test_size=0.2, random_state=9)

# Scaling the features with new features
scaler_x_big = MinMaxScaler()
X_train_big_scaled = scaler_x_big.fit_transform(X_train_big)
X_test_big_scaled = scaler_x_big.transform(X_test_big)



In [12]:
# Initializing and training the Random Forest Model with new features
rf_model_big = RandomForestClassifier()
rf_model_big.fit(X_train_big_scaled, Y_train_big)



RandomForestClassifier()

In [13]:
# Making predictions on the test set with new features
Y_pred_new = rf_model_big.predict(X_test_big_scaled)

In [14]:
# Evaluating the model's accuracy with new features
accuracy_new = accuracy_score(Y_test_big, Y_pred_new)

# Displaying the accuracy with 11 features
print(f"Model Accuracy with 11 features: {accuracy_new}")

Model Accuracy with 11 features: 1.0
