In [146]:
# Importing the required dependencies or libraries
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import plotly
import plotly.express as px
import plotly.offline as pyo
import cufflinks as cf
import sklearn.utils
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
pyo.init_notebook_mode(connected=True)
import imblearn
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
cf.go_offline()
%matplotlib inline

In [147]:
df = pd.read_csv('./Data/NFA_public_data_2019.csv', low_memory=False)

In [148]:
df['QScore'].value_counts()

3A    51481
2A    10576
2B    10096
1B       16
1A       16
Name: QScore, dtype: int64

In [149]:
df.isna().sum()

country               0
year                  0
country_code          0
record                0
crop_land         20472
grazing_land      20472
forest_land       20472
fishing_ground    20473
built_up_land     20473
carbon            20473
total                 9
QScore                1
dtype: int64

In [150]:
df = df.dropna()

In [151]:
df.isna().sum()

country           0
year              0
country_code      0
record            0
crop_land         0
grazing_land      0
forest_land       0
fishing_ground    0
built_up_land     0
carbon            0
total             0
QScore            0
dtype: int64

In [152]:
df['QScore'].value_counts()

3A    51473
2A      224
1A       16
Name: QScore, dtype: int64

In [153]:
df['QScore'] = df['QScore'].replace(['1A'], '2A')

In [154]:
df.QScore.value_counts() 

3A    51473
2A      240
Name: QScore, dtype: int64

In [155]:
df_2A = df[df.QScore == '2A']
df_3A = df[df.QScore == '3A'].sample(350)

In [156]:
data_df = df_2A.append(df_3A)

In [157]:
data_df = sklearn.utils.shuffle(data_df)
data_df = data_df.reset_index(drop=True)

In [158]:
data_df.shape

(590, 12)

In [159]:
data_df.QScore.value_counts()

3A    350
2A    240
Name: QScore, dtype: int64

In [160]:
data_df = data_df.drop(columns=['country_code', 'country', 'year'])
X = data_df.drop(columns='QScore')
y = data_df['QScore']

In [161]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) 

In [162]:
y_train.value_counts() 

3A    250
2A    163
Name: QScore, dtype: int64

In [163]:
encoder = LabelEncoder()
x_train.record = encoder.fit_transform(x_train.record)
x_test.record = encoder.transform(x_test.record)

In [164]:
smote = SMOTE(random_state= 1 )
x_train_balanced, y_balanced = smote.fit_sample(x_train, y_train)

In [165]:
scaler = MinMaxScaler()
normalised_train_df = scaler.fit_transform(x_train_balanced.drop(columns=['record']))
normalised_train_df = pd.DataFrame(normalised_train_df, columns=x_train_balanced.drop(columns=['record']).columns)
normalised_train_df['record'] = x_train_balanced['record'] 

In [166]:
x_test = x_test.reset_index(drop=True)
normalised_test_df = scaler.transform(x_test.drop(columns=['record']))
normalised_test_df = pd.DataFrame(normalised_test_df, columns=x_test.drop(columns=['record']).columns)
normalised_test_df['record'] = x_test['record']

In [167]:
log_reg = LogisticRegression(max_iter=1000000)
log_reg.fit(normalised_train_df, y_balanced)

LogisticRegression(max_iter=1000000)

In [168]:
scores = cross_val_score(log_reg, normalised_train_df, y_balanced, cv=5, scoring='f1_macro')
scores

array([0.50392724, 0.52957662, 0.50758718, 0.54778414, 0.57555178])

In [169]:
# kf = KFold(n_splits=5)
# kf.split(normalised_train_df)
# f1_scores = []
# #run for every split
# for train_index, test_index in kf.split(normalised_train_df):
#     x_train, x_test = normalised_train_df.iloc[train_index], normalised_train_df.iloc[test_index]
#     y_train, y_test = y_balanced[train_index], y_balanced[test_index]
#     model = LogisticRegression().fit(x_train, y_train)
#     f1_scores.append(f1_score(y_true=y_test, y_pred=model.predict(x_test), pos_label='2A')*100)
# f1_scores

In [170]:
# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
# f1_scores = []
# #run for every split
# for train_index, test_index in skf.split(normalised_train_df, y_balanced):
#     x_train, x_test = np.array(normalised_train_df)[train_index], np.array(normalised_train_df)[test_index]
#     y_train, y_test = y_balanced[train_index], y_balanced[test_index]
#     model = LogisticRegression().fit(x_train, y_train)
#     #save result to list
#     f1_scores.append(f1_score(y_true=y_test, y_pred=model.predict(x_test), pos_label='2A')*100)
# f1_scores

In [171]:
# loo = LeaveOneOut()
# scores = cross_val_score(LogisticRegression(), normalised_train_df, y_balanced, cv=loo, scoring= 'f1_macro')
# average_score = scores.mean()*100
# scores

In [172]:
# new_predictions = log_reg.predict(normalised_test_df)
# cnf_mat = confusion_matrix(y_true=y_test, y_pred=new_predictions, labels=['2A', '3A'])
# cnf_mat

In [173]:
accuracy = accuracy_score(y_true=y_test, y_pred=new_predictions)
print( 'Accuracy: {}' .format(round(accuracy* 100 ), 2 ))

Accuracy: 51


In [174]:
precision = precision_score(y_true=y_test, y_pred=new_predictions, pos_label= '2A' )
print( 'Precision: {}' .format(round(precision* 100 ), 2 )) 

Precision: 46


In [175]:
recall = recall_score(y_true=y_test, y_pred=new_predictions, pos_label= '2A' )
print( 'Recall: {}' .format(round(recall* 100 ), 2 ))

Recall: 70


In [176]:
f1 = f1_score(y_true=y_test, y_pred=new_predictions, pos_label= '2A' )
print( 'F1: {}' .format(round(f1* 100 ), 2 ))

F1: 56


In [177]:
dec_tree = DecisionTreeClassifier()
dec_tree.fit(normalised_train_df, y_balanced)

DecisionTreeClassifier()