In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('https://query.data.world/s/wh6j7rxy2hvrn4ml75ci62apk5hgae', low_memory=False)
#describe the dataset
df.describe()

Unnamed: 0,year,country_code,crop_land,grazing_land,fishing_ground,built_up_land,carbon,total
count,72186.0,72186.0,51714.0,51714.0,51713.0,51713.0,51713.0,72177.0
mean,1990.263154,155.618915,19207720.0,13528750.0,10048600.0,1984737.0,30726090.0,77929600.0
std,16.018761,390.261979,168581900.0,129418000.0,108137500.0,17304880.0,398963000.0,745544300.0
min,1961.0,1.0,0.0,0.0,0.0,0.0,0.0,0.01876963
25%,1977.0,59.0,0.358108,0.1948302,0.08005409,0.03786298,0.0,1.894023
50%,1991.0,121.0,3.25181,9.978667,6.154094,0.2044437,0.0,8292.253
75%,2004.0,193.0,3610817.0,1734192.0,945000.0,386718.0,19.15736,15329590.0
max,2016.0,5001.0,3984702000.0,3417089000.0,2979605000.0,472616300.0,12571600000.0,20611820000.0


In [3]:
#print the first five rows of the table
df.head()

Unnamed: 0,country,year,country_code,record,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total,QScore
0,Armenia,1992,1,AreaPerCap,0.140292,0.199546,0.097188051,0.036888,0.02932,0.0,0.5032351,3A
1,Armenia,1992,1,AreaTotHA,483000.0,687000.0,334600.0,127000.0,100943.0008,0.0,1732543.0,3A
2,Armenia,1992,1,BiocapPerCap,0.159804,0.135261,0.084003213,0.013742,0.033398,0.0,0.4262086,3A
3,Armenia,1992,1,BiocapTotGHA,550176.2427,465677.9722,289207.1078,47311.55172,114982.2793,0.0,1467355.0,3A
4,Armenia,1992,1,EFConsPerCap,0.38751,0.189462,1.26e-06,0.004165,0.033398,1.114093,1.728629,3A


In [55]:
#check distribution of target variable
df['QScore'].value_counts()

3A    51473
2A      240
Name: QScore, dtype: int64

In [56]:
#check the rows with missing values
df.isna().sum()

country           0
year              0
country_code      0
record            0
crop_land         0
grazing_land      0
forest_land       0
fishing_ground    0
built_up_land     0
carbon            0
total             0
QScore            0
dtype: int64

In [57]:
#for simplicity, the rows with missing values is dropped
df = df.dropna()
df.isna().sum()

country           0
year              0
country_code      0
record            0
crop_land         0
grazing_land      0
forest_land       0
fishing_ground    0
built_up_land     0
carbon            0
total             0
QScore            0
dtype: int64

In [58]:
'''An obvious change in our target variable after removing the missing values is that there are only three classes left
and from the distribution of the three classes there is an imbalance between the classes. There are methods to handle this
imbalance, such as oversampling and undersampling.
'''
#Oversampling involves increasing the number of instances in the class with fewer instances. While undersampling involves reducing the data points in the class with more instances
#For this example it will be converted to a binary classification problem by combining class '1A' and '2A'

df['QScore'] = df['QScore'].replace(['1A'], '2A')
df['QScore'].value_counts()

3A    51473
2A      240
Name: QScore, dtype: int64

In [59]:
df_2A = df[df.QScore=='2A']
df_3A = df[df.QScore=='3A'].sample(350)
data_df = df_2A.append(df_3A)

In [60]:
import sklearn.utils
data_df = sklearn.utils.shuffle(data_df)
data_df = data_df.reset_index(drop=True)

In [61]:
data_df.QScore.value_counts()

3A    350
2A    240
Name: QScore, dtype: int64

In [62]:
#more processing
data_df = data_df.drop(columns=['country_code', 'country', 'year'])

In [63]:
X = data_df.drop(columns='QScore')
y = data_df['QScore']

In [64]:
#split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
y_train.value_counts()

3A    247
2A    166
Name: QScore, dtype: int64

In [65]:
#encode categorical variable
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
X_train.record = encoder.fit_transform(X_train.record)
X_test.record = encoder.transform(X_test.record)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [66]:
import imblearn
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=1)
X_train_balanced, y_balanced = smote.fit_sample(X_train, y_train)

In [67]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
normalised_train_df = scaler.fit_transform(X_train_balanced.drop(columns=['record']))
normalised_train_df=pd.DataFrame(normalised_train_df, 
 columns=X_train_balanced.drop(columns=[ 'record' ]).columns)
normalised_train_df[ 'record' ]=X_train_balanced[ 'record']

X_test=X_test.reset_index(drop=True)
normalised_test_df=scaler.transform(X_test.drop(columns=['record']))
normalised_test_df=pd.DataFrame(normalised_test_df, columns=X_test.drop(columns=['record']).columns)
normalised_test_df['record']=X_test['record']

In [68]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression
log_reg=LogisticRegression()
log_reg.fit(normalised_train_df, y_balanced)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=100, multi_class='auto', n_jobs=None, penalty=12, random_state=None, solver='lbfigs', tol=0.0001, verbose=0, warm_start=False)

LogisticRegression(penalty=12, solver='lbfigs')

# Measuring Classification Performance

### Cross Validation and Accuracy

In [69]:
from sklearn.model_selection import cross_val_score
scores=cross_val_score(log_reg, normalised_train_df, y_balanced, cv=5, scoring='f1_macro')
print(scores)

[0.43289689 0.54471129 0.51251964 0.49448529 0.47198276]


### KFold Cross Validation

In [92]:
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
kf=KFold(n_splits=5)
kf.split(normalised_train_df)
#create ans empty list
f1_scores = []
for train_index, test_index in kf.split(normalised_train_df):
    x_train, x_test = normalised_train_df.iloc[train_index], normalised_train_df.iloc[test_index]
    y_train, y_test = y_balanced[train_index], y_balanced[test_index]
model = LogisticRegression().fit(x_train, y_train)
#save result to the list
f1_scores.append(f1_score(y_true=y_test, y_pred=model.predict(x_test), pos_label='2A')*100)
#print the list
print(f1_scores)

[0.0]


### Stratified K-Fold Cross Validation

In [93]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
#create an empty list
f1_scores = []
#run for every split
for train_index, test_index in skf.split(normalised_train_df, y_balanced):
    x_train, x_test = np.array(normalised_train_df)[train_index], np.array(normalised_train_df)[test_index]
    y_train, y_test = y_balanced[train_index], y_balanced[test_index]
model = LogisticRegression().fit(x_train, y_train)
#save result to the list
f1_scores.append(f1_score(y_true=y_test, y_pred=model.predict(x_test), pos_label='2A') * 100)
#print the list
f1_scores

[65.21739130434783]

### Leave One Out Cross Validation

In [94]:
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()
scores = cross_val_score(LogisticRegression(), normalised_train_df, y_balanced, cv=loo, scoring='f1_macro')
average_score = scores.mean() * 100
print(average_score)

44.93927125506073


In [95]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

### Confusion Matrix

In [99]:
#import the necessary libraries
from sklearn.metrics import  recall_score, accuracy_score, precision_score, confusion_matrix
new_predictions = log_reg.predict(normalised_test_df)
conf_mat = confusion_matrix(y_true=y_test, y_pred=new_predictions, labels=['2A', '3A'])
conf_mat

array([[46, 28],
       [67, 36]], dtype=int64)

# Measuring Performance

### Accuracy

In [100]:
accuracy = accuracy_score(y_true=y_test, y_pred=new_predictions)
print("Accuracy: {}".format(round(accuracy*100), 2))

Accuracy: 46


### Precision

In [101]:
precision = precision_score(y_true=y_test, y_pred=new_predictions, pos_label='2A')
print("Precision: {}".format(round(accuracy*100), 2))

Precision: 46


### Recall

In [102]:
recall_score = recall_score(y_true=y_test, y_pred=new_predictions, pos_label='2A')
print("Recall: {}".format(round(recall_score*100), 2))

Recall: 62


### F1-Score

In [103]:
f1 = f1_score(y_true=y_test, y_pred=new_predictions, pos_label='2A')
print( 'F1:{}'.format(round(f1* 100 ),2))

F1:49


### Applying a decision tree classifier

In [104]:
from sklearn.tree import DecisionTreeClassifier
dec_tree = DecisionTreeClassifier()
dec_tree_model = dec_tree.fit(normalised_train_df, y_balanced)
dec_tree_prediction = dec_tree_model.predict(normalised_test_df)

In [105]:
#measuring the performance of the decission tree classifier
cv_score = cross_val_score(dec_tree, normalised_train_df, y_balanced, cv=5, scoring='f1_macro')
print(cv_score)

[0.56286796 0.57536765 0.59549502 0.51556481 0.53564738]


In [106]:
#getting the f1 Score of the decision tree classifier
dec_tree_f1_score = f1_score(y_true=y_test, y_pred=dec_tree_prediction, pos_label='2A')
print("F1 Score for the decision tree: {}".format(round(dec_tree_f1_score*100), 2))

F1 Score for the decision tree: 47
