In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [2]:
encodings_to_try = ['utf-8', 'latin1', 'ISO-8859-1']
for encoding in encodings_to_try:
    try:
        globaled = pd.read_csv("Global_Education.csv", encoding=encoding)
        break 
    except UnicodeDecodeError:
        print(f"Failed with encoding {encoding}. Trying the next one.")
print(globaled.head())

Failed with encoding utf-8. Trying the next one.
  Countries and areas  Latitude   Longitude  OOSR_Pre0Primary_Age_Male  \
0         Afghanistan  33.939110  67.709953                          0   
1             Albania  41.153332  20.168331                          4   
2             Algeria  28.033886   1.659626                          0   
3             Andorra  42.506285   1.521801                          0   
4              Angola  11.202692  17.873887                         31   

   OOSR_Pre0Primary_Age_Female  OOSR_Primary_Age_Male  \
0                            0                      0   
1                            2                      6   
2                            0                      0   
3                            0                      0   
4                           39                      0   

   OOSR_Primary_Age_Female  OOSR_Lower_Secondary_Age_Male  \
0                        0                              0   
1                        3               

In [3]:
features = ['Latitude', 'Longitude', 'OOSR_Pre0Primary_Age_Male', 'OOSR_Pre0Primary_Age_Female', 'OOSR_Primary_Age_Male', 'OOSR_Primary_Age_Female', 'OOSR_Lower_Secondary_Age_Male', 'OOSR_Lower_Secondary_Age_Female', 'OOSR_Upper_Secondary_Age_Male', 'OOSR_Upper_Secondary_Age_Female', 'Completion_Rate_Primary_Male', 'Completion_Rate_Primary_Female', 'Completion_Rate_Lower_Secondary_Male', 'Completion_Rate_Lower_Secondary_Female', 'Completion_Rate_Upper_Secondary_Male', 'Completion_Rate_Upper_Secondary_Female', 'Grade_2_3_Proficiency_Reading', 'Grade_2_3_Proficiency_Math', 'Primary_End_Proficiency_Reading', 'Primary_End_Proficiency_Math', 'Lower_Secondary_End_Proficiency_Reading', 'Lower_Secondary_End_Proficiency_Math', 'Youth_15_24_Literacy_Rate_Male', 'Youth_15_24_Literacy_Rate_Female', 'Birth_Rate', 'Gross_Primary_Education_Enrollment', 'Gross_Tertiary_Education_Enrollment']
X = globaled[features]
Y = globaled['Unemployment_Rate']

KeyError: "['Latitude'] not in index"

In [None]:
X = pd.get_dummies(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
rf_classifier.fit(X_train, y_train)

In [None]:
predictions = rf_classifier.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)
classification_rep = classification_report(y_test, predictions)


In [None]:
print(f'Accuracy: {accuracy:.2f}')
print('\nConfusion Matrix:')
print(conf_matrix)
print('\nClassification Report:')
print(classification_rep)

In [None]:
feature_importances = rf_classifier.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

In [None]:
plt.figure(figsize=(10, 6))
plt.bar(feature_importance_df['Feature'], feature_importance_df['Importance'])
plt.xlabel('Feature')
plt.ylabel('Importance')
plt.title('Feature Importances')
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
from sklearn.tree import export_text

In [None]:
for tree_id in range(3):
    tree_rules = export_text(rf_classifier.estimators_[tree_id], feature_names=list(X.columns))
    print(f"Decision Tree {tree_id + 1}:\n{tree_rules}\n")


In [None]:
class_balance = airstrike['DAMAGE_LEVEL'].value_counts()
print("Class Balance:")
print(class_balance)


In [None]:
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    # Add other hyperparameters to tune
}

In [None]:
# Grid Search for Hyperparameter Tuning
# from sklearn.model_selection import GridSearchCV
# grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5)
# grid_search.fit(X_train, y_train)
# best_params = grid_search.best_params_
# print("Best Hyperparameters:", best_params)


In [None]:
# Improved Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)
rf_classifier.fit(X_train, y_train)

In [None]:
# Cross-validation
from sklearn.model_selection import cross_val_score
cross_val_scores = cross_val_score(rf_classifier, X_train, y_train, cv=5)
print("Cross-validation Scores:", cross_val_scores)


In [None]:
# Visualize Decision Tree
from sklearn.tree import export_graphviz
import graphviz

In [None]:
# Choose a tree to visualize (e.g., the first tree)
tree_to_visualize = rf_classifier.estimators_[0]

In [None]:
# Export as dot file
dot_data = export_graphviz(tree_to_visualize, out_file=None, feature_names=list(X.columns), class_names=list(map(str, y.unique())), filled=True, rounded=True)

In [None]:
# Visualize the graph
graph = graphviz.Source(dot_data)
graph.render("decision_tree")
graph.view("decision_tree")