In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from keras.utils import to_categorical



In [2]:
# Loading the dataset
data = pd.read_csv('31-40.csv')

In [3]:
# Preprocessing 
data['cit_2017'] = data['cit_2017'].astype(str).str.replace(',', '')
data['cit_2017'] = data['cit_2017'].astype('int')
data.dtypes

univ_rank         int64
first_initial    object
last_initial     object
cit_2017          int32
cit_2018          int64
cit_2019          int64
cit_2020          int64
cit_2021          int64
cit_2022          int64
h_index           int64
i_10_index        int64
dtype: object

In [4]:
# Selecting numerical features
numdata=data.select_dtypes(include=['number'])
numdata2=numdata.drop(['h_index','i_10_index', 'univ_rank'],axis=1)

In [5]:
# Creating a new column 'Ratio' based on the division of 'cit_2022' by 'cit_2021'
numdata2['Ratio'] = (numdata2['cit_2022'] / numdata2['cit_2021']).round(2)

In [6]:
# Classifying the 'Ratio' values into 'Low', 'Medium', or 'High'
def classify_value(value):
    if value < 1.05:
        return 'Low'
    elif 1.05 <= value <= 1.15:
        return 'Medium'
    else:
        return 'High'

In [7]:
# Applying the classification function to the 'Ratio' column
numdata2['Ratio']=numdata2['Ratio'].apply(classify_value)

In [8]:
# Splitting the dataset into features (X) and target variable (y)
import pandas as pd
X=numdata2.iloc[:,0:6]
y= pd.DataFrame(numdata2['Ratio'], columns = ['Ratio'])

In [9]:
# Mapping the categorical labels to numerical values
label_mapping = {'Low': 0, 'Medium': 1, 'High': 2}
y['Ratio']= [label_mapping[label] for label in y['Ratio']]

In [10]:
# Scaling the features
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [11]:
X

array([[1.90217391e-03, 4.80526050e-03, 1.32872708e-02, 2.01612903e-02,
        3.07510349e-02, 3.34152334e-02],
       [8.15217391e-04, 1.01163379e-03, 3.72043582e-03, 2.79377880e-02,
        3.40035482e-02, 3.24324324e-02],
       [2.55163043e-01, 2.23823976e-01, 2.10736115e-01, 2.42799539e-01,
        2.47191011e-01, 1.85257985e-01],
       [1.25000000e-02, 1.84623166e-02, 1.96651608e-02, 1.92972350e-02,
        1.62625665e-02, 1.40049140e-02],
       [3.50543478e-02, 3.74304502e-02, 2.84347595e-02, 2.73617512e-02,
        2.39503253e-02, 2.06388206e-02],
       [3.80434783e-02, 4.80526050e-02, 5.52750465e-02, 7.45967742e-02,
        1.51981076e-01, 2.28255528e-01],
       [8.15217391e-04, 2.02326758e-03, 3.72043582e-03, 8.64055300e-03,
        1.09402720e-02, 2.01474201e-02],
       [1.29347826e-01, 1.41122914e-01, 1.41376561e-01, 2.01900922e-01,
        1.71200473e-01, 1.74447174e-01],
       [2.52717391e-02, 3.08548306e-02, 4.62397024e-02, 7.66129032e-02,
        8.16085157e-02, 

In [12]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [13]:
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# Creating an AdaBoost classifier
adaboost = AdaBoostClassifier(n_estimators=800, learning_rate=0.5)

In [15]:
# Training the AdaBoost classifier
adaboost.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


AdaBoostClassifier(learning_rate=0.5, n_estimators=800)

In [16]:
# Making predictions on the test set
y_pred_1 = adaboost.predict(X_test)

In [17]:
# Evaluating the accuracy of the model
accuracy = accuracy_score(y_test, y_pred_1)
print(f'Accuracy: {accuracy:.2%}')

Accuracy: 70.00%


In [20]:
from sklearn.metrics import confusion_matrix

# Displaying the confusion matrix
cmatrix = confusion_matrix(y_test, y_pred_1)
print("Confusion Matrix:")
print(cmatrix)

Confusion Matrix:
[[9 0 2]
 [1 1 0]
 [1 2 4]]


In [21]:
from sklearn.metrics import classification_report

# Displaying the classification report
target= ['Low', 'Medium', 'High']  
creport = classification_report(y_test, y_pred_1, target_names=target)
print("Classification Report:")
print(creport)

Classification Report:
              precision    recall  f1-score   support

         Low       0.82      0.82      0.82        11
      Medium       0.33      0.50      0.40         2
        High       0.67      0.57      0.62         7

    accuracy                           0.70        20
   macro avg       0.61      0.63      0.61        20
weighted avg       0.72      0.70      0.71        20

