In [1]:
# Import Dependencies
import pandas as pd
from pathlib import Path
import sklearn

In [2]:
# Load Data
data = Path('Resources/edu_attain_final.csv')
edu_attain_raw = pd.read_csv(data)
edu_attain_raw

Unnamed: 0,country,years,subject,asab,val,total_pop,pop_with_given_ed_mil
0,AUS,2000,NONE,MEN,34.259043,9.443465,3.235241
1,AUS,2000,NONE,WOMEN,48.123203,9.585337,4.612771
2,AUS,2000,TRY,MEN,26.104818,9.443465,2.465199
3,AUS,2000,TRY,WOMEN,28.838318,9.585337,2.764250
4,AUS,2000,UPPSRY,MEN,39.636139,9.443465,3.743025
...,...,...,...,...,...,...,...
4863,ZAF,2020,NONE,WOMEN,52.301288,30.452449,15.927023
4864,ZAF,2020,TRY,MEN,14.797209,29.086248,4.303953
4865,ZAF,2020,TRY,WOMEN,16.851519,30.452449,5.131700
4866,ZAF,2020,UPPSRY,MEN,32.806484,29.086248,9.542175


In [3]:
# Binary Encooding on asab (gender) column
edu_attain = pd.get_dummies(edu_attain_raw, columns = ["asab"])
edu_attain

Unnamed: 0,country,years,subject,val,total_pop,pop_with_given_ed_mil,asab_MEN,asab_WOMEN
0,AUS,2000,NONE,34.259043,9.443465,3.235241,1,0
1,AUS,2000,NONE,48.123203,9.585337,4.612771,0,1
2,AUS,2000,TRY,26.104818,9.443465,2.465199,1,0
3,AUS,2000,TRY,28.838318,9.585337,2.764250,0,1
4,AUS,2000,UPPSRY,39.636139,9.443465,3.743025,1,0
...,...,...,...,...,...,...,...,...
4863,ZAF,2020,NONE,52.301288,30.452449,15.927023,0,1
4864,ZAF,2020,TRY,14.797209,29.086248,4.303953,1,0
4865,ZAF,2020,TRY,16.851519,30.452449,5.131700,0,1
4866,ZAF,2020,UPPSRY,32.806484,29.086248,9.542175,1,0


In [4]:
# Label Encoding on subject column
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
edu_attain['subject'] = le.fit_transform(edu_attain['subject'])
edu_attain.head(10)

Unnamed: 0,country,years,subject,val,total_pop,pop_with_given_ed_mil,asab_MEN,asab_WOMEN
0,AUS,2000,0,34.259043,9.443465,3.235241,1,0
1,AUS,2000,0,48.123203,9.585337,4.612771,0,1
2,AUS,2000,1,26.104818,9.443465,2.465199,1,0
3,AUS,2000,1,28.838318,9.585337,2.76425,0,1
4,AUS,2000,2,39.636139,9.443465,3.743025,1,0
5,AUS,2000,2,23.038479,9.585337,2.208316,0,1
6,BEL,2000,0,41.293152,5.012012,2.069618,1,0
7,BEL,2000,0,41.662346,5.239235,2.182788,0,1
8,BEL,2000,1,26.565285,5.012012,1.331455,1,0
9,BEL,2000,1,27.610184,5.239235,1.446562,0,1


In [5]:
# Get list of all countires
edu_attain['country'].unique().tolist()

['AUS',
 'BEL',
 'CAN',
 'CHE',
 'CRI',
 'CZE',
 'DEU',
 'DNK',
 'ESP',
 'EST',
 'FIN',
 'FRA',
 'GBR',
 'GRC',
 'HUN',
 'IRL',
 'ITA',
 'JPN',
 'KOR',
 'LTU',
 'LUX',
 'LVA',
 'MEX',
 'NLD',
 'OAVG',
 'POL',
 'PRT',
 'SVK',
 'SVN',
 'SWE',
 'TUR',
 'USA',
 'ISR',
 'ISL',
 'ARG',
 'AUT',
 'IDN',
 'NOR',
 'BRA',
 'CHL',
 'CHN',
 'G20',
 'RUS',
 'ZAF',
 'IND',
 'COL',
 'NZL',
 'SAU']

In [6]:
# Remove OAVG rows
edu_attain = edu_attain[edu_attain['country'] != 'OAVG']
edu_attain

Unnamed: 0,country,years,subject,val,total_pop,pop_with_given_ed_mil,asab_MEN,asab_WOMEN
0,AUS,2000,0,34.259043,9.443465,3.235241,1,0
1,AUS,2000,0,48.123203,9.585337,4.612771,0,1
2,AUS,2000,1,26.104818,9.443465,2.465199,1,0
3,AUS,2000,1,28.838318,9.585337,2.764250,0,1
4,AUS,2000,2,39.636139,9.443465,3.743025,1,0
...,...,...,...,...,...,...,...,...
4863,ZAF,2020,0,52.301288,30.452449,15.927023,0,1
4864,ZAF,2020,1,14.797209,29.086248,4.303953,1,0
4865,ZAF,2020,1,16.851519,30.452449,5.131700,0,1
4866,ZAF,2020,2,32.806484,29.086248,9.542175,1,0


In [7]:
# Countries dictionary, excluding OAVG
countries = {
    'AUS': 1,
    'BEL': 2,
    'CAN': 3,
    'CHE': 4,
    'CRI': 5,
    'CZE': 6,
    'DEU': 7,
    'DNK': 8,
    'ESP': 9,
    'EST': 10,
    'FIN': 11,
    'FRA': 12,
    'GBR': 13,
    'GRC': 14,
    'HUN': 15,
    'IRL': 16,
    'ITA': 17,
    'JPN': 18,
    'KOR': 19,
    'LTU': 20,
    'LUX': 21,
    'LVA': 22,
    'MEX': 23,
    'NLD': 24,
    'POL': 25,
    'PRT': 26,
    'SVK': 27,
    'SVN': 28,
    'SWE': 29,
    'TUR': 30,
    'USA': 31,
    'ISR': 32,
    'ISL': 33,
    'ARG': 34,
    'AUT': 35,
    'IDN': 36,
    'NOR': 37,
    'BRA': 38,
    'CHL': 39,
    'CHN': 40,
    'G20': 41,
    'RUS': 42,
    'ZAF': 43,
    'IND': 44,
    'COL': 45,
    'NZL': 46,
    'SAU': 47
}

In [8]:
# Custom Encoding on country column using countries dictionary
edu_attain["country_num"] = edu_attain["country"].apply(lambda x: countries[x])
edu_attain

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,country,years,subject,val,total_pop,pop_with_given_ed_mil,asab_MEN,asab_WOMEN,country_num
0,AUS,2000,0,34.259043,9.443465,3.235241,1,0,1
1,AUS,2000,0,48.123203,9.585337,4.612771,0,1,1
2,AUS,2000,1,26.104818,9.443465,2.465199,1,0,1
3,AUS,2000,1,28.838318,9.585337,2.764250,0,1,1
4,AUS,2000,2,39.636139,9.443465,3.743025,1,0,1
...,...,...,...,...,...,...,...,...,...
4863,ZAF,2020,0,52.301288,30.452449,15.927023,0,1,43
4864,ZAF,2020,1,14.797209,29.086248,4.303953,1,0,43
4865,ZAF,2020,1,16.851519,30.452449,5.131700,0,1,43
4866,ZAF,2020,2,32.806484,29.086248,9.542175,1,0,43


In [9]:
# Drop country column
edu_attain = edu_attain.drop(['country'], axis = 1)
edu_attain

Unnamed: 0,years,subject,val,total_pop,pop_with_given_ed_mil,asab_MEN,asab_WOMEN,country_num
0,2000,0,34.259043,9.443465,3.235241,1,0,1
1,2000,0,48.123203,9.585337,4.612771,0,1,1
2,2000,1,26.104818,9.443465,2.465199,1,0,1
3,2000,1,28.838318,9.585337,2.764250,0,1,1
4,2000,2,39.636139,9.443465,3.743025,1,0,1
...,...,...,...,...,...,...,...,...
4863,2020,0,52.301288,30.452449,15.927023,0,1,43
4864,2020,1,14.797209,29.086248,4.303953,1,0,43
4865,2020,1,16.851519,30.452449,5.131700,0,1,43
4866,2020,2,32.806484,29.086248,9.542175,1,0,43


In [47]:
# Define features
x = edu_attain.copy()
x = x.drop("subject", axis = 1)
x.head()

Unnamed: 0,years,val,total_pop,pop_with_given_ed_mil,asab_MEN,asab_WOMEN,country_num
0,2000,34.259043,9.443465,3.235241,1,0,1
1,2000,48.123203,9.585337,4.612771,0,1,1
2,2000,26.104818,9.443465,2.465199,1,0,1
3,2000,28.838318,9.585337,2.76425,0,1,1
4,2000,39.636139,9.443465,3.743025,1,0,1


In [15]:
# Define the target set
y = edu_attain['subject'].ravel()
y[:12]

array([0, 0, 1, 1, 2, 2, 0, 0, 1, 1, 2, 2])

In [17]:
# Split into Train & Test Sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 1, stratify = y)

In [18]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(3556, 7)
(1186, 7)
(3556,)
(1186,)


In [26]:
# Import Models
from sklearn import ensemble
from sklearn import naive_bayes
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [33]:
ml_list = {
    "LogisticRegression":sklearn.linear_model.LogisticRegression(solver='lbfgs', max_iter=200, random_state=1),
    "RandomForestClassifier": ensemble.RandomForestClassifier(),
    "GaussianNB": naive_bayes.GaussianNB(),
    "SVM": SVC(kernel = 'linear')
}

In [46]:
results = []
for model in ml_list:
    ml = ml_list[model]
    ml.fit(x_train, y_train)
    y_pred = ml.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    results.append({
        "name": model,
        "accuracy":accuracy
    })
    print(f"{model} - Accuracy: {accuracy}")

LogisticRegression - Accuracy: 0.5177065767284992
RandomForestClassifier - Accuracy: 0.8591905564924115
GaussianNB - Accuracy: 0.4671163575042159
SVM - Accuracy: 0.4924114671163575


In [35]:
results_df = pd.DataFrame(results).sort_values("accuracy", ascending=False)
results_df

Unnamed: 0,name,accuracy
1,RandomForestClassifier,0.858347
0,LogisticRegression,0.517707
3,SVM,0.492411
2,GaussianNB,0.467116


In [36]:
# Making predictions using the testing data
rf_model = ensemble.RandomForestClassifier()
rf_model = rf_model.fit(x_train, y_train)
predictions = rf_model.predict(x_test)

In [40]:
# Calculating the Confusion Matrix
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1", "Actual 2"], 
    columns=["Predicted 0", "Predicted 1", "Predicted 2"]
)

cm_df

Unnamed: 0,Predicted 0,Predicted 1,Predicted 2
Actual 0,353,29,17
Actual 1,34,323,42
Actual 2,12,36,340


In [39]:
# Accuracy Score
acc_score = accuracy_score(y_test, predictions)
acc_score

0.8566610455311973

In [41]:
# Display Classification Report
print("Classification Report")
print(classification_report(y_test, predictions))

Classification Report
              precision    recall  f1-score   support

           0       0.88      0.88      0.88       399
           1       0.83      0.81      0.82       399
           2       0.85      0.88      0.86       388

    accuracy                           0.86      1186
   macro avg       0.86      0.86      0.86      1186
weighted avg       0.86      0.86      0.86      1186



In [48]:
# Calculate feature importance in the Random Forest model
importances = rf_model.feature_importances_

# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.31061344749208153, 'val'),
 (0.24189848602038208, 'pop_with_given_ed_mil'),
 (0.18733646840955792, 'total_pop'),
 (0.14837602543821923, 'country_num'),
 (0.08786547354347231, 'years'),
 (0.012145026417592245, 'asab_WOMEN'),
 (0.01176507267869448, 'asab_MEN')]