# Military Power Clustering Project
-World military power
Source : globalfirepower.com on 1st may 2020

- https://data.world/vizzup/world-military-power/workspace/file?filename=World+military+power.xlsx


# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import scipy.stats as stats
import pyforest
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder, StandardScaler, PowerTransformer, MinMaxScaler, RobustScaler
from sklearn.model_selection import KFold, cross_val_predict, train_test_split, GridSearchCV, cross_val_score, cross_validate
from sklearn.linear_model import LinearRegression, Lasso, Ridge,ElasticNet
from sklearn.metrics import plot_confusion_matrix, r2_score, mean_absolute_error, mean_squared_error, classification_report, confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import make_scorer, precision_score, precision_recall_curve, plot_precision_recall_curve, plot_roc_curve, roc_auc_score, roc_curve, f1_score, accuracy_score, recall_score
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, ExtraTreesRegressor, AdaBoostClassifier
from sklearn.feature_selection import SelectKBest, SelectPercentile, f_classif, f_regression, mutual_info_regression
from xgboost import XGBRegressor, XGBClassifier
from xgboost import plot_importance
from sklearn.pipeline import Pipeline
from sklearn.tree import plot_tree
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

#importing plotly and cufflinks in offline mode
import cufflinks as cf
import plotly.offline
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

import warnings
warnings.filterwarnings('ignore')
warnings.warn("this will not show")
plt.rcParams["figure.figsize"] = (10,6)
pd.set_option('max_colwidth',200)

# pd.set_option('display.max_rows', 100) # if you wish to see more rows rather than default, just uncomment this line.
pd.set_option('display.max_columns', 200)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

import colorama
from colorama import Fore, Style  # maakes strings colored
# !pip3 install termcolor
from termcolor import colored

# User's Defined Function

In [2]:
# Function for determining the number and percentages of missing values

def missing (df):
    missing_number = df.isnull().sum().sort_values(ascending=False)
    missing_percent = (df.isnull().sum()/df.isnull().count()).sort_values(ascending=False)
    missing_values = pd.concat([missing_number, missing_percent], axis=1, keys=['Missing_Number', 'Missing_Percent'])
    return missing_values

In [3]:
# To view summary information about the column

def first_looking(col):
    print("column name    : ", col)
    print("--------------------------------")
    print("per_of_nulls   : ", "%", round(df[col].isnull().sum()/df.shape[0]*100, 2))
    print("num_of_nulls   : ", df[col].isnull().sum())
    print("num_of_uniques : ", df[col].nunique())
    print(df[col].value_counts(dropna = False))

In [4]:
def train_val(y_train, y_train_pred, y_test, y_pred):
    
    scores = {"train_set": {"R2" : r2_score(y_train, y_train_pred),
    "mae" : mean_absolute_error(y_train, y_train_pred),
    "mse" : mean_squared_error(y_train, y_train_pred),                          
    "rmse" : np.sqrt(mean_squared_error(y_train, y_train_pred))},
    
    "test_set": {"R2" : r2_score(y_test, y_pred),
    "mae" : mean_absolute_error(y_test, y_pred),
    "mse" : mean_squared_error(y_test, y_pred),
    "rmse" : np.sqrt(mean_squared_error(y_test, y_pred))}}
    
    return pd.DataFrame(scores)

In [5]:
def eval_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    mse = mean_squared_error(actual, pred)
    score = r2_score(actual, pred)
    return print("r2_score:", score, "\n","mae:", mae, "\n","mse:",mse, "\n","rmse:",rmse)

# Ingest Data  

In [6]:
df0 = pd.read_excel("World military power.xlsx", header=1)
df = df0.copy()

In [7]:
df0.head(1)

Unnamed: 0,Military Strength,Military Strength Power Index,Aircraft Strength,Aircraft Strength value,Fighter/Interceptor Strength,Fighter/Interceptor Strength value,Attack Aircraft Strength,Attack Aircraft Strength value,Transport Aircraft Fleet Strength,Transport Aircraft Fleet Strength value,Trainer Aircraft Fleet,Trainer Aircraft Fleet value,Helicopter Fleet Strength,Helicopter Fleet Strength value,Attack Helicopter Fleet Strength,Attack Helicopter Fleet Strength value,Tank Strength,Tank Strength value,AFV/APC Strength,AFV/APC Strength value,Self-Propelled Artillery Strength,Self-Propelled Artillery Strength value,Towed Artillery Strength,Towed Artillery Strength value,Rocket Projector Strength,Rocket Projector Strength value,Navy Fleet Strengths,Navy Fleet Strengths value,Aircraft Carrier Fleet Strength,Aircraft Carrier Fleet Strength value,Submarine Fleet Strength,Submarine Fleet Strength value,Destroyer Fleet Strength,Destroyer Fleet Strength value,Frigate Fleet Strength,Frigate Fleet Strength value,defense spending budget,defense spending budget value,External Debt,External Debt value,Airport Totals,Airport Totals value,Oil Production,Oil Production value,Oil Consumption,Oil Consumption value,Proven Oil Reserves,Proven Oil Reserves value,Available Manpower,Available Manpower value,Total Population,Total Population value,Total Square Land Area,Total Square Land Area value,Total Coastline Coverage,Total Coastline Coverage value,Total Waterway Coverage,Total Waterway Coverage value,Total Border Coverage,Total Border Coverage value
0,Afghanistan,1.344,Afghanistan,260,Afghanistan,0,Afghanistan,25,Afghanistan,30,Afghanistan,0,Afghanistan,187,Afghanistan,0,Afghanistan,0,Afghanistan,1062,Afghanistan,0,Afghanistan,176,Afghanistan,50,Afghanistan,0.0,Afghanistan,0.0,Afghanistan,0.0,Afghanistan,0.0,Afghanistan,0.0,Afghanistan,12000000000,Afghanistan,2840000000,Afghanistan,43,Afghanistan,0,Afghanistan,5500,Afghanistan,0,Afghanistan,14325743,Afghanistan,34940837,Afghanistan,652230,Afghanistan,0,Afghanistan,1200,Afghanistan,5987.0


# EDA

## Implement basic steps to see how is your data looks like

In [8]:
df.head(3)

Unnamed: 0,Military Strength,Military Strength Power Index,Aircraft Strength,Aircraft Strength value,Fighter/Interceptor Strength,Fighter/Interceptor Strength value,Attack Aircraft Strength,Attack Aircraft Strength value,Transport Aircraft Fleet Strength,Transport Aircraft Fleet Strength value,Trainer Aircraft Fleet,Trainer Aircraft Fleet value,Helicopter Fleet Strength,Helicopter Fleet Strength value,Attack Helicopter Fleet Strength,Attack Helicopter Fleet Strength value,Tank Strength,Tank Strength value,AFV/APC Strength,AFV/APC Strength value,Self-Propelled Artillery Strength,Self-Propelled Artillery Strength value,Towed Artillery Strength,Towed Artillery Strength value,Rocket Projector Strength,Rocket Projector Strength value,Navy Fleet Strengths,Navy Fleet Strengths value,Aircraft Carrier Fleet Strength,Aircraft Carrier Fleet Strength value,Submarine Fleet Strength,Submarine Fleet Strength value,Destroyer Fleet Strength,Destroyer Fleet Strength value,Frigate Fleet Strength,Frigate Fleet Strength value,defense spending budget,defense spending budget value,External Debt,External Debt value,Airport Totals,Airport Totals value,Oil Production,Oil Production value,Oil Consumption,Oil Consumption value,Proven Oil Reserves,Proven Oil Reserves value,Available Manpower,Available Manpower value,Total Population,Total Population value,Total Square Land Area,Total Square Land Area value,Total Coastline Coverage,Total Coastline Coverage value,Total Waterway Coverage,Total Waterway Coverage value,Total Border Coverage,Total Border Coverage value
0,Afghanistan,1.344,Afghanistan,260,Afghanistan,0,Afghanistan,25,Afghanistan,30,Afghanistan,0,Afghanistan,187,Afghanistan,0,Afghanistan,0,Afghanistan,1062,Afghanistan,0,Afghanistan,176,Afghanistan,50,Afghanistan,0.0,Afghanistan,0.0,Afghanistan,0.0,Afghanistan,0.0,Afghanistan,0.0,Afghanistan,12000000000,Afghanistan,2840000000,Afghanistan,43,Afghanistan,0,Afghanistan,5500,Afghanistan,0,Afghanistan,14325743,Afghanistan,34940837,Afghanistan,652230,Afghanistan,0,Afghanistan,1200,Afghanistan,5987.0
1,Albania,2.314,Albania,19,Albania,0,Albania,0,Albania,0,Albania,0,Albania,19,Albania,0,Albania,0,Albania,467,Albania,0,Albania,0,Albania,0,Albania,38.0,Albania,0.0,Albania,0.0,Albania,0.0,Albania,0.0,Albania,250000000,Albania,9505000000,Albania,4,Albania,16000,Albania,42500,Albania,168300000,Albania,1519438,Albania,3057220,Albania,28748,Albania,362,Albania,41,Albania,691.0
2,Algeria,0.466,Algeria,551,Algeria,103,Algeria,22,Algeria,59,Algeria,87,Algeria,257,Algeria,45,Algeria,880,Algeria,7361,Algeria,320,Algeria,240,Algeria,316,Algeria,201.0,Algeria,0.0,Algeria,6.0,Algeria,0.0,Algeria,5.0,Algeria,13000000000,Algeria,6260000000,Algeria,157,Algeria,1306000,Algeria,325000,Algeria,12200000000,Algeria,20741263,Algeria,41657488,Algeria,2381741,Algeria,998,Algeria,0,Algeria,6734.0


In [9]:
df.tail(3)

Unnamed: 0,Military Strength,Military Strength Power Index,Aircraft Strength,Aircraft Strength value,Fighter/Interceptor Strength,Fighter/Interceptor Strength value,Attack Aircraft Strength,Attack Aircraft Strength value,Transport Aircraft Fleet Strength,Transport Aircraft Fleet Strength value,Trainer Aircraft Fleet,Trainer Aircraft Fleet value,Helicopter Fleet Strength,Helicopter Fleet Strength value,Attack Helicopter Fleet Strength,Attack Helicopter Fleet Strength value,Tank Strength,Tank Strength value,AFV/APC Strength,AFV/APC Strength value,Self-Propelled Artillery Strength,Self-Propelled Artillery Strength value,Towed Artillery Strength,Towed Artillery Strength value,Rocket Projector Strength,Rocket Projector Strength value,Navy Fleet Strengths,Navy Fleet Strengths value,Aircraft Carrier Fleet Strength,Aircraft Carrier Fleet Strength value,Submarine Fleet Strength,Submarine Fleet Strength value,Destroyer Fleet Strength,Destroyer Fleet Strength value,Frigate Fleet Strength,Frigate Fleet Strength value,defense spending budget,defense spending budget value,External Debt,External Debt value,Airport Totals,Airport Totals value,Oil Production,Oil Production value,Oil Consumption,Oil Consumption value,Proven Oil Reserves,Proven Oil Reserves value,Available Manpower,Available Manpower value,Total Population,Total Population value,Total Square Land Area,Total Square Land Area value,Total Coastline Coverage,Total Coastline Coverage value,Total Waterway Coverage,Total Waterway Coverage value,Total Border Coverage,Total Border Coverage value
135,Yemen,1.241,Yemen,169,Yemen,77,Yemen,0,Yemen,8,Yemen,21,Yemen,61,Yemen,14,Yemen,620,Yemen,615,Yemen,20,Yemen,85,Yemen,150,Yemen,30.0,Yemen,0.0,Yemen,0.0,Yemen,0.0,Yemen,0.0,Yemen,1400000000,Yemen,7068000000,Yemen,57,Yemen,12260,Yemen,145000,Yemen,3000000000,Yemen,11266221,Yemen,28667230,Yemen,527968,Yemen,1906.0,Yemen,0,Yemen,1601.0
136,Zambia,1.646,Zambia,108,Zambia,18,Zambia,0,Zambia,11,Zambia,52,Zambia,27,Zambia,0,Zambia,75,Zambia,184,Zambia,0,Zambia,42,Zambia,50,Zambia,0.0,Zambia,0.0,Zambia,0.0,Zambia,0.0,Zambia,0.0,Zambia,40000000,Zambia,11660000000,Zambia,88,Zambia,0,Zambia,21000,Zambia,0,Zambia,6166905,Zambia,16445079,Zambia,752618,,,Zambia,2250,Zambia,6043.0
137,Zimbabwe,1.758,Zimbabwe,90,Zimbabwe,10,Zimbabwe,0,Zimbabwe,14,Zimbabwe,38,Zimbabwe,28,Zimbabwe,6,Zimbabwe,42,Zimbabwe,300,Zimbabwe,0,Zimbabwe,65,Zimbabwe,22,Zimbabwe,0.0,Zimbabwe,0.0,Zimbabwe,0.0,Zimbabwe,0.0,Zimbabwe,0.0,Zimbabwe,100000000,Zimbabwe,9357000000,Zimbabwe,196,Zimbabwe,0,Zimbabwe,19500,Zimbabwe,0,Zimbabwe,5584086,Zimbabwe,14030368,Zimbabwe,390757,,,Zimbabwe,0,Zimbabwe,3229.0


In [10]:
df.sample(10)

Unnamed: 0,Military Strength,Military Strength Power Index,Aircraft Strength,Aircraft Strength value,Fighter/Interceptor Strength,Fighter/Interceptor Strength value,Attack Aircraft Strength,Attack Aircraft Strength value,Transport Aircraft Fleet Strength,Transport Aircraft Fleet Strength value,Trainer Aircraft Fleet,Trainer Aircraft Fleet value,Helicopter Fleet Strength,Helicopter Fleet Strength value,Attack Helicopter Fleet Strength,Attack Helicopter Fleet Strength value,Tank Strength,Tank Strength value,AFV/APC Strength,AFV/APC Strength value,Self-Propelled Artillery Strength,Self-Propelled Artillery Strength value,Towed Artillery Strength,Towed Artillery Strength value,Rocket Projector Strength,Rocket Projector Strength value,Navy Fleet Strengths,Navy Fleet Strengths value,Aircraft Carrier Fleet Strength,Aircraft Carrier Fleet Strength value,Submarine Fleet Strength,Submarine Fleet Strength value,Destroyer Fleet Strength,Destroyer Fleet Strength value,Frigate Fleet Strength,Frigate Fleet Strength value,defense spending budget,defense spending budget value,External Debt,External Debt value,Airport Totals,Airport Totals value,Oil Production,Oil Production value,Oil Consumption,Oil Consumption value,Proven Oil Reserves,Proven Oil Reserves value,Available Manpower,Available Manpower value,Total Population,Total Population value,Total Square Land Area,Total Square Land Area value,Total Coastline Coverage,Total Coastline Coverage value,Total Waterway Coverage,Total Waterway Coverage value,Total Border Coverage,Total Border Coverage value
84,Nicaragua,2.275,Nicaragua,19,Nicaragua,0,Nicaragua,0,Nicaragua,5,Nicaragua,1,Nicaragua,14,Nicaragua,0,Nicaragua,104,Nicaragua,265,Nicaragua,0,Nicaragua,654,Nicaragua,163,Nicaragua,30.0,Nicaragua,0.0,Nicaragua,0.0,Nicaragua,0.0,Nicaragua,0.0,Nicaragua,140000000,Nicaragua,11310000000,Nicaragua,147,Nicaragua,0,Nicaragua,35000,Nicaragua,0,Nicaragua,3030436,Nicaragua,6085213,Nicaragua,130370,Nicaragua,910.0,Nicaragua,2220,Nicaragua,1253.0
52,Iraq,0.791,Iraq,348,Iraq,26,Iraq,33,Iraq,16,Iraq,102,Iraq,186,Iraq,40,Iraq,309,Iraq,4739,Iraq,44,Iraq,120,Iraq,30,Iraq,60.0,Iraq,0.0,Iraq,0.0,Iraq,0.0,Iraq,0.0,Iraq,1730000000,Iraq,73020000000,Iraq,102,Iraq,4454000,Iraq,825000,Iraq,142500000000,Iraq,16399240,Iraq,40194216,Iraq,438317,Iraq,58.0,Iraq,5279,Iraq,3809.0
39,Finland,0.85,Finland,194,Finland,55,Finland,0,Finland,11,Finland,105,Finland,127,Finland,0,Finland,200,Finland,2050,Finland,100,Finland,627,Finland,75,Finland,246.0,Finland,0.0,Finland,0.0,Finland,0.0,Finland,0.0,Finland,3570000000,Finland,150600000000,Finland,148,Finland,0,Finland,205000,Finland,0,Finland,2307420,Finland,5537364,Finland,338145,Finland,1250.0,Finland,7842,Finland,2563.0
7,Austria,0.957,Austria,120,Austria,15,Austria,0,Austria,11,Austria,32,Austria,62,Austria,0,Austria,56,Austria,467,Austria,33,Austria,0,Austria,0,Austria,0.0,,,,,,,,,Austria,3380000000,Austria,630800000000,Austria,52,Austria,14260,Austria,215000,Austria,43000000,Austria,4017691,Austria,8793370,Austria,83871,,,Austria,0,Austria,2524.0
134,Vietnam,0.356,Vietnam,293,Vietnam,77,Vietnam,0,Vietnam,38,Vietnam,36,Vietnam,138,Vietnam,25,Vietnam,2615,Vietnam,2530,Vietnam,70,Vietnam,1000,Vietnam,85,Vietnam,65.0,Vietnam,0.0,Vietnam,6.0,Vietnam,0.0,Vietnam,9.0,Vietnam,5500000000,Vietnam,96580000000,Vietnam,45,Vietnam,271400,Vietnam,525000,Vietnam,4400000000,Vietnam,51043216,Vietnam,97040334,Vietnam,331210,Vietnam,3444.0,Vietnam,17702,Vietnam,4616.0
13,Bhutan,10.168,Bhutan,2,Bhutan,0,Bhutan,0,Bhutan,0,Bhutan,0,Bhutan,2,Bhutan,0,Bhutan,0,Bhutan,27,Bhutan,0,Bhutan,0,Bhutan,0,Bhutan,0.0,,,,,,,,,Bhutan,25120000,Bhutan,2671000000,Bhutan,2,Bhutan,0,Bhutan,2000,Bhutan,0,Bhutan,176808,Bhutan,766397,Bhutan,38394,,,Bhutan,0,Bhutan,1136.0
40,France,0.17,France,1229,France,269,France,0,France,121,France,187,France,589,France,62,France,528,France,6028,France,109,France,12,France,13,France,180.0,France,4.0,France,9.0,France,11.0,France,11.0,France,41500000000,France,5360000000000,France,464,France,15170,France,1600000,France,72350000,France,30111868,France,67364357,France,643801,France,4853.0,France,8501,France,4072.0
49,India,0.095,India,2123,India,538,India,172,India,250,India,359,India,722,India,23,India,4292,India,8686,India,235,India,4060,India,266,India,285.0,India,1.0,India,16.0,India,10.0,India,13.0,India,61000000000,India,501600000000,India,346,India,733900,India,5000000,India,4621000000,India,622480340,India,1296834042,India,3287263,India,7000.0,India,14500,India,13888.0
135,Yemen,1.241,Yemen,169,Yemen,77,Yemen,0,Yemen,8,Yemen,21,Yemen,61,Yemen,14,Yemen,620,Yemen,615,Yemen,20,Yemen,85,Yemen,150,Yemen,30.0,Yemen,0.0,Yemen,0.0,Yemen,0.0,Yemen,0.0,Yemen,1400000000,Yemen,7068000000,Yemen,57,Yemen,12260,Yemen,145000,Yemen,3000000000,Yemen,11266221,Yemen,28667230,Yemen,527968,Yemen,1906.0,Yemen,0,Yemen,1601.0
78,Mozambique,2.336,Mozambique,16,Mozambique,8,Mozambique,0,Mozambique,1,Mozambique,1,Mozambique,6,Mozambique,2,Mozambique,100,Mozambique,335,Mozambique,0,Mozambique,260,Mozambique,12,Mozambique,8.0,Mozambique,0.0,Mozambique,0.0,Mozambique,0.0,Mozambique,0.0,Mozambique,245000000,Mozambique,10910000000,Mozambique,98,Mozambique,0,Mozambique,22500,Mozambique,0,Mozambique,4629744,Mozambique,27233789,Mozambique,799380,Mozambique,2470.0,Mozambique,460,Mozambique,4783.0


In [11]:
df.columns

Index(['Military Strength', 'Military Strength Power Index',
       'Aircraft Strength', 'Aircraft Strength value',
       'Fighter/Interceptor Strength', 'Fighter/Interceptor Strength value',
       'Attack Aircraft Strength', 'Attack Aircraft Strength value',
       'Transport Aircraft Fleet Strength',
       'Transport Aircraft Fleet Strength value', 'Trainer Aircraft Fleet',
       'Trainer Aircraft Fleet value', 'Helicopter Fleet Strength',
       'Helicopter Fleet Strength value', 'Attack Helicopter Fleet Strength',
       'Attack Helicopter Fleet Strength value', 'Tank Strength',
       'Tank Strength value', 'AFV/APC Strength', 'AFV/APC Strength value',
       'Self-Propelled Artillery Strength',
       'Self-Propelled Artillery Strength value', 'Towed Artillery Strength',
       'Towed Artillery Strength value', 'Rocket Projector Strength',
       'Rocket Projector Strength value', 'Navy Fleet Strengths',
       'Navy Fleet Strengths value', 'Aircraft Carrier Fleet Strength',


In [12]:
df.columns = df.columns.str.lower().str.replace('&', '_').str.replace(' ', '_')

In [13]:
df.columns

Index(['military_strength', 'military_strength_power_index',
       'aircraft_strength', 'aircraft_strength_value',
       'fighter/interceptor_strength', 'fighter/interceptor_strength_value',
       'attack_aircraft_strength', 'attack_aircraft_strength_value',
       'transport_aircraft_fleet_strength',
       'transport_aircraft_fleet_strength_value', 'trainer_aircraft_fleet',
       'trainer_aircraft_fleet_value', 'helicopter_fleet_strength',
       'helicopter_fleet_strength_value', 'attack_helicopter_fleet_strength',
       'attack_helicopter_fleet_strength_value', 'tank_strength',
       'tank_strength_value', 'afv/apc_strength', 'afv/apc_strength_value',
       'self-propelled_artillery_strength',
       'self-propelled_artillery_strength_value', 'towed_artillery_strength',
       'towed_artillery_strength_value', 'rocket_projector_strength',
       'rocket_projector_strength_value', 'navy_fleet_strengths',
       'navy_fleet_strengths_value', 'aircraft_carrier_fleet_strength',


In [14]:
df.shape

(138, 60)

In [15]:
print("There is", df.shape[0], "observation and", df.shape[1], "columns in the dataset")

There is 138 observation and 60 columns in the dataset


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138 entries, 0 to 137
Data columns (total 60 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   military_strength                        138 non-null    object 
 1   military_strength_power_index            138 non-null    float64
 2   aircraft_strength                        138 non-null    object 
 3   aircraft_strength_value                  138 non-null    int64  
 4   fighter/interceptor_strength             138 non-null    object 
 5   fighter/interceptor_strength_value       138 non-null    int64  
 6   attack_aircraft_strength                 138 non-null    object 
 7   attack_aircraft_strength_value           138 non-null    int64  
 8   transport_aircraft_fleet_strength        138 non-null    object 
 9   transport_aircraft_fleet_strength_value  138 non-null    int64  
 10  trainer_aircraft_fleet                   138 non-n

In [17]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
military_strength_power_index,138.0,1.461,1.324,0.061,0.575,1.034,2.022,10.168
aircraft_strength_value,138.0,388.471,1231.982,0.0,33.0,111.0,290.25,13264.0
fighter/interceptor_strength_value,138.0,81.565,230.325,0.0,0.0,17.0,60.5,2085.0
attack_aircraft_strength_value,138.0,25.761,94.528,0.0,0.0,0.0,15.75,742.0
transport_aircraft_fleet_strength_value,138.0,30.232,92.33,0.0,3.0,9.0,26.0,945.0
trainer_aircraft_fleet_value,138.0,82.833,240.804,0.0,4.25,26.0,79.25,2643.0
helicopter_fleet_strength_value,138.0,154.065,520.184,0.0,15.25,44.0,126.75,5768.0
attack_helicopter_fleet_strength_value,138.0,25.623,97.326,0.0,0.0,2.0,17.75,967.0
tank_strength_value,138.0,646.565,1515.464,0.0,19.25,153.0,436.25,12950.0
afv/apc_strength_value,138.0,2485.696,5410.546,0.0,250.25,734.5,2000.0,39253.0


In [18]:
df.describe(include=object).T

Unnamed: 0,count,unique,top,freq
military_strength,138,138,Germany,1
aircraft_strength,138,138,Germany,1
fighter/interceptor_strength,138,138,Germany,1
attack_aircraft_strength,138,138,Germany,1
transport_aircraft_fleet_strength,138,138,Germany,1
trainer_aircraft_fleet,138,138,Germany,1
helicopter_fleet_strength,138,138,Germany,1
attack_helicopter_fleet_strength,138,138,Germany,1
tank_strength,138,138,Germany,1
afv/apc_strength,138,138,Germany,1


In [19]:
df.nunique()

military_strength                          138
military_strength_power_index              138
aircraft_strength                          138
aircraft_strength_value                    113
fighter/interceptor_strength               138
fighter/interceptor_strength_value          63
attack_aircraft_strength                   138
attack_aircraft_strength_value              38
transport_aircraft_fleet_strength          138
transport_aircraft_fleet_strength_value     49
trainer_aircraft_fleet                     138
trainer_aircraft_fleet_value                77
helicopter_fleet_strength                  138
helicopter_fleet_strength_value             97
attack_helicopter_fleet_strength           138
attack_helicopter_fleet_strength_value      45
tank_strength                              138
tank_strength_value                        106
afv/apc_strength                           138
afv/apc_strength_value                     130
self-propelled_artillery_strength          138
self-propelle

In [20]:
# to find how many unique values object features have
for col in df.select_dtypes(include=[np.number]).columns:
  print(f"{col} has {df[col].nunique()} unique value")

military_strength_power_index has 138 unique value
aircraft_strength_value has 113 unique value
fighter/interceptor_strength_value has 63 unique value
attack_aircraft_strength_value has 38 unique value
transport_aircraft_fleet_strength_value has 49 unique value
trainer_aircraft_fleet_value has 77 unique value
helicopter_fleet_strength_value has 97 unique value
attack_helicopter_fleet_strength_value has 45 unique value
tank_strength_value has 106 unique value
afv/apc_strength_value has 130 unique value
self-propelled_artillery_strength_value has 70 unique value
towed_artillery_strength_value has 91 unique value
rocket_projector_strength_value has 68 unique value
navy_fleet_strengths_value has 78 unique value
aircraft_carrier_fleet_strength_value has 5 unique value
submarine_fleet_strength_value has 20 unique value
destroyer_fleet_strength_value has 11 unique value
frigate_fleet_strength_value has 18 unique value
airport_totals_value has 108 unique value
total_border_coverage_value has 1

In [21]:
df.duplicated().value_counts()

False    138
dtype: int64

## Convert multi index columns to one level

In [22]:
df.head(1)

Unnamed: 0,military_strength,military_strength_power_index,aircraft_strength,aircraft_strength_value,fighter/interceptor_strength,fighter/interceptor_strength_value,attack_aircraft_strength,attack_aircraft_strength_value,transport_aircraft_fleet_strength,transport_aircraft_fleet_strength_value,trainer_aircraft_fleet,trainer_aircraft_fleet_value,helicopter_fleet_strength,helicopter_fleet_strength_value,attack_helicopter_fleet_strength,attack_helicopter_fleet_strength_value,tank_strength,tank_strength_value,afv/apc_strength,afv/apc_strength_value,self-propelled_artillery_strength,self-propelled_artillery_strength_value,towed_artillery_strength,towed_artillery_strength_value,rocket_projector_strength,rocket_projector_strength_value,navy_fleet_strengths,navy_fleet_strengths_value,aircraft_carrier_fleet_strength,aircraft_carrier_fleet_strength_value,submarine_fleet_strength,submarine_fleet_strength_value,destroyer_fleet_strength,destroyer_fleet_strength_value,frigate_fleet_strength,frigate_fleet_strength_value,defense_spending_budget,defense_spending_budget_value,external_debt,external_debt_value,airport_totals,airport_totals_value,oil_production,oil_production_value,oil_consumption,oil_consumption_value,proven_oil_reserves,proven_oil_reserves_value,available_manpower,available_manpower_value,total_population,total_population_value,total_square_land_area,total_square_land_area_value,total_coastline_coverage,total_coastline_coverage_value,total_waterway_coverage,total_waterway_coverage_value,total_border_coverage,total_border_coverage_value
0,Afghanistan,1.344,Afghanistan,260,Afghanistan,0,Afghanistan,25,Afghanistan,30,Afghanistan,0,Afghanistan,187,Afghanistan,0,Afghanistan,0,Afghanistan,1062,Afghanistan,0,Afghanistan,176,Afghanistan,50,Afghanistan,0.0,Afghanistan,0.0,Afghanistan,0.0,Afghanistan,0.0,Afghanistan,0.0,Afghanistan,12000000000,Afghanistan,2840000000,Afghanistan,43,Afghanistan,0,Afghanistan,5500,Afghanistan,0,Afghanistan,14325743,Afghanistan,34940837,Afghanistan,652230,Afghanistan,0,Afghanistan,1200,Afghanistan,5987.0


In [23]:
df.rename(columns={'military_strength': 'country'}, inplace=True)

In [24]:
df.head(1)

Unnamed: 0,country,military_strength_power_index,aircraft_strength,aircraft_strength_value,fighter/interceptor_strength,fighter/interceptor_strength_value,attack_aircraft_strength,attack_aircraft_strength_value,transport_aircraft_fleet_strength,transport_aircraft_fleet_strength_value,trainer_aircraft_fleet,trainer_aircraft_fleet_value,helicopter_fleet_strength,helicopter_fleet_strength_value,attack_helicopter_fleet_strength,attack_helicopter_fleet_strength_value,tank_strength,tank_strength_value,afv/apc_strength,afv/apc_strength_value,self-propelled_artillery_strength,self-propelled_artillery_strength_value,towed_artillery_strength,towed_artillery_strength_value,rocket_projector_strength,rocket_projector_strength_value,navy_fleet_strengths,navy_fleet_strengths_value,aircraft_carrier_fleet_strength,aircraft_carrier_fleet_strength_value,submarine_fleet_strength,submarine_fleet_strength_value,destroyer_fleet_strength,destroyer_fleet_strength_value,frigate_fleet_strength,frigate_fleet_strength_value,defense_spending_budget,defense_spending_budget_value,external_debt,external_debt_value,airport_totals,airport_totals_value,oil_production,oil_production_value,oil_consumption,oil_consumption_value,proven_oil_reserves,proven_oil_reserves_value,available_manpower,available_manpower_value,total_population,total_population_value,total_square_land_area,total_square_land_area_value,total_coastline_coverage,total_coastline_coverage_value,total_waterway_coverage,total_waterway_coverage_value,total_border_coverage,total_border_coverage_value
0,Afghanistan,1.344,Afghanistan,260,Afghanistan,0,Afghanistan,25,Afghanistan,30,Afghanistan,0,Afghanistan,187,Afghanistan,0,Afghanistan,0,Afghanistan,1062,Afghanistan,0,Afghanistan,176,Afghanistan,50,Afghanistan,0.0,Afghanistan,0.0,Afghanistan,0.0,Afghanistan,0.0,Afghanistan,0.0,Afghanistan,12000000000,Afghanistan,2840000000,Afghanistan,43,Afghanistan,0,Afghanistan,5500,Afghanistan,0,Afghanistan,14325743,Afghanistan,34940837,Afghanistan,652230,Afghanistan,0,Afghanistan,1200,Afghanistan,5987.0


In [25]:
#df = df.reset_index()
#del df['index']

In [26]:
#df.head(1)

In [27]:
#df.set_index('country', inplace=True)

In [28]:
#df.head(1)

## Detect and drop duplicated columns

In [29]:
df.columns

Index(['country', 'military_strength_power_index', 'aircraft_strength',
       'aircraft_strength_value', 'fighter/interceptor_strength',
       'fighter/interceptor_strength_value', 'attack_aircraft_strength',
       'attack_aircraft_strength_value', 'transport_aircraft_fleet_strength',
       'transport_aircraft_fleet_strength_value', 'trainer_aircraft_fleet',
       'trainer_aircraft_fleet_value', 'helicopter_fleet_strength',
       'helicopter_fleet_strength_value', 'attack_helicopter_fleet_strength',
       'attack_helicopter_fleet_strength_value', 'tank_strength',
       'tank_strength_value', 'afv/apc_strength', 'afv/apc_strength_value',
       'self-propelled_artillery_strength',
       'self-propelled_artillery_strength_value', 'towed_artillery_strength',
       'towed_artillery_strength_value', 'rocket_projector_strength',
       'rocket_projector_strength_value', 'navy_fleet_strengths',
       'navy_fleet_strengths_value', 'aircraft_carrier_fleet_strength',
       'aircraft_

In [30]:
drop_columns = ['aircraft_strength',
               'fighter/interceptor_strength',
               'attack_aircraft_strength',
               'transport_aircraft_fleet_strength',
               'trainer_aircraft_fleet',
               'helicopter_fleet_strength',
               'attack_helicopter_fleet_strength',
               'tank_strength',
               'afv/apc_strength',
               'self-propelled_artillery_strength',
               'towed_artillery_strength',
               'rocket_projector_strength',
               'navy_fleet_strengths',
               'aircraft_carrier_fleet_strength',
               'submarine_fleet_strength',
               'destroyer_fleet_strength',
               'frigate_fleet_strength',
               'defense_spending_budget',
               'external_debt',
               'airport_totals', 
               'oil_production',
               'oil_consumption',
               'proven_oil_reserves',
               'available_manpower',
               'total_population',
               'total_square_land_area',
               'total_coastline_coverage',
               'total_waterway_coverage',
               'total_border_coverage',
               ]

In [31]:
df.drop(drop_columns, axis = 1, inplace = True)

In [32]:
df.head(1)

Unnamed: 0,country,military_strength_power_index,aircraft_strength_value,fighter/interceptor_strength_value,attack_aircraft_strength_value,transport_aircraft_fleet_strength_value,trainer_aircraft_fleet_value,helicopter_fleet_strength_value,attack_helicopter_fleet_strength_value,tank_strength_value,afv/apc_strength_value,self-propelled_artillery_strength_value,towed_artillery_strength_value,rocket_projector_strength_value,navy_fleet_strengths_value,aircraft_carrier_fleet_strength_value,submarine_fleet_strength_value,destroyer_fleet_strength_value,frigate_fleet_strength_value,defense_spending_budget_value,external_debt_value,airport_totals_value,oil_production_value,oil_consumption_value,proven_oil_reserves_value,available_manpower_value,total_population_value,total_square_land_area_value,total_coastline_coverage_value,total_waterway_coverage_value,total_border_coverage_value
0,Afghanistan,1.344,260,0,25,30,0,187,0,0,1062,0,176,50,0.0,0.0,0.0,0.0,0.0,12000000000,2840000000,43,0,5500,0,14325743,34940837,652230,0,1200,5987.0


In [33]:
df.shape

(138, 31)

## Check for missing values 

In [34]:
missing (df)

Unnamed: 0,Missing_Number,Missing_Percent
total_coastline_coverage_value,29,0.21
aircraft_carrier_fleet_strength_value,23,0.167
frigate_fleet_strength_value,23,0.167
destroyer_fleet_strength_value,23,0.167
submarine_fleet_strength_value,23,0.167
navy_fleet_strengths_value,14,0.101
total_border_coverage_value,9,0.065
helicopter_fleet_strength_value,0,0.0
attack_helicopter_fleet_strength_value,0,0.0
total_waterway_coverage_value,0,0.0


## Deal with missing values 
* Hint-1, use wikipedia to fill in missing values ​​comparing with similar countries
* Hint-2, Georeferences of countries can help you

In [35]:
# submarine_fleet_strength_value : South Korea --> 22
# frigate_fleet_strength_value : South Korea --> 4
# destroyer_fleet_strength_value : South Korea --> 12

In [36]:
df.loc[(df['country']=="South Kores") & (df['submarine_fleet_strength_value'].isnull()), 'submarine_fleet_strength_value'] = 22

In [37]:
df.loc[(df['country']=="South Kores") & (df['frigate_fleet_strength_value'].isnull()), 'frigate_fleet_strength_value'] = 4

In [38]:
df.loc[(df['country']=="South Kores") & (df['destroyer_fleet_strength_value'].isnull()), 'destroyer_fleet_strength_value'] = 12

In [39]:
df.fillna(0, inplace=True)

In [40]:
missing (df)

Unnamed: 0,Missing_Number,Missing_Percent
country,0,0.0
submarine_fleet_strength_value,0,0.0
total_waterway_coverage_value,0,0.0
total_coastline_coverage_value,0,0.0
total_square_land_area_value,0,0.0
total_population_value,0,0.0
available_manpower_value,0,0.0
proven_oil_reserves_value,0,0.0
oil_consumption_value,0,0.0
oil_production_value,0,0.0


In [41]:
df = df.reset_index()
del df['index']

In [42]:
df.head(1)

Unnamed: 0,country,military_strength_power_index,aircraft_strength_value,fighter/interceptor_strength_value,attack_aircraft_strength_value,transport_aircraft_fleet_strength_value,trainer_aircraft_fleet_value,helicopter_fleet_strength_value,attack_helicopter_fleet_strength_value,tank_strength_value,afv/apc_strength_value,self-propelled_artillery_strength_value,towed_artillery_strength_value,rocket_projector_strength_value,navy_fleet_strengths_value,aircraft_carrier_fleet_strength_value,submarine_fleet_strength_value,destroyer_fleet_strength_value,frigate_fleet_strength_value,defense_spending_budget_value,external_debt_value,airport_totals_value,oil_production_value,oil_consumption_value,proven_oil_reserves_value,available_manpower_value,total_population_value,total_square_land_area_value,total_coastline_coverage_value,total_waterway_coverage_value,total_border_coverage_value
0,Afghanistan,1.344,260,0,25,30,0,187,0,0,1062,0,176,50,0.0,0.0,0.0,0.0,0.0,12000000000,2840000000,43,0,5500,0,14325743,34940837,652230,0,1200,5987.0


In [43]:
df.set_index('country', inplace=True)

In [44]:
df.head(1)

Unnamed: 0_level_0,military_strength_power_index,aircraft_strength_value,fighter/interceptor_strength_value,attack_aircraft_strength_value,transport_aircraft_fleet_strength_value,trainer_aircraft_fleet_value,helicopter_fleet_strength_value,attack_helicopter_fleet_strength_value,tank_strength_value,afv/apc_strength_value,self-propelled_artillery_strength_value,towed_artillery_strength_value,rocket_projector_strength_value,navy_fleet_strengths_value,aircraft_carrier_fleet_strength_value,submarine_fleet_strength_value,destroyer_fleet_strength_value,frigate_fleet_strength_value,defense_spending_budget_value,external_debt_value,airport_totals_value,oil_production_value,oil_consumption_value,proven_oil_reserves_value,available_manpower_value,total_population_value,total_square_land_area_value,total_coastline_coverage_value,total_waterway_coverage_value,total_border_coverage_value
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
Afghanistan,1.344,260,0,25,30,0,187,0,0,1062,0,176,50,0.0,0.0,0.0,0.0,0.0,12000000000,2840000000,43,0,5500,0,14325743,34940837,652230,0,1200,5987.0


In [45]:
df.shape

(138, 30)

## Fix the dtypes

In [46]:
df.dtypes

military_strength_power_index              float64
aircraft_strength_value                      int64
fighter/interceptor_strength_value           int64
attack_aircraft_strength_value               int64
transport_aircraft_fleet_strength_value      int64
trainer_aircraft_fleet_value                 int64
helicopter_fleet_strength_value              int64
attack_helicopter_fleet_strength_value       int64
tank_strength_value                          int64
afv/apc_strength_value                       int64
self-propelled_artillery_strength_value      int64
towed_artillery_strength_value               int64
rocket_projector_strength_value              int64
navy_fleet_strengths_value                 float64
aircraft_carrier_fleet_strength_value      float64
submarine_fleet_strength_value             float64
destroyer_fleet_strength_value             float64
frigate_fleet_strength_value               float64
defense_spending_budget_value               object
external_debt_value            

In [47]:
df.head(1)

Unnamed: 0_level_0,military_strength_power_index,aircraft_strength_value,fighter/interceptor_strength_value,attack_aircraft_strength_value,transport_aircraft_fleet_strength_value,trainer_aircraft_fleet_value,helicopter_fleet_strength_value,attack_helicopter_fleet_strength_value,tank_strength_value,afv/apc_strength_value,self-propelled_artillery_strength_value,towed_artillery_strength_value,rocket_projector_strength_value,navy_fleet_strengths_value,aircraft_carrier_fleet_strength_value,submarine_fleet_strength_value,destroyer_fleet_strength_value,frigate_fleet_strength_value,defense_spending_budget_value,external_debt_value,airport_totals_value,oil_production_value,oil_consumption_value,proven_oil_reserves_value,available_manpower_value,total_population_value,total_square_land_area_value,total_coastline_coverage_value,total_waterway_coverage_value,total_border_coverage_value
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
Afghanistan,1.344,260,0,25,30,0,187,0,0,1062,0,176,50,0.0,0.0,0.0,0.0,0.0,12000000000,2840000000,43,0,5500,0,14325743,34940837,652230,0,1200,5987.0


In [48]:
for col in df.columns:
    if df.dtypes[col] == np.object:
        df[col] = df[col].apply(lambda x : x.replace(",", "") if type(x) != int else x).astype("float")

In [49]:
#df["oil_production_value"]  # BEFORE

In [50]:
#df["oil_production_value"]  # AFTER

In [51]:
missing(df)

Unnamed: 0,Missing_Number,Missing_Percent
military_strength_power_index,0,0.0
aircraft_strength_value,0,0.0
total_waterway_coverage_value,0,0.0
total_coastline_coverage_value,0,0.0
total_square_land_area_value,0,0.0
total_population_value,0,0.0
available_manpower_value,0,0.0
proven_oil_reserves_value,0,0.0
oil_consumption_value,0,0.0
oil_production_value,0,0.0


## Drop the features that not affect military power
* Hint-1,  Georeferences not proper features

In [52]:
df.head(1)

Unnamed: 0_level_0,military_strength_power_index,aircraft_strength_value,fighter/interceptor_strength_value,attack_aircraft_strength_value,transport_aircraft_fleet_strength_value,trainer_aircraft_fleet_value,helicopter_fleet_strength_value,attack_helicopter_fleet_strength_value,tank_strength_value,afv/apc_strength_value,self-propelled_artillery_strength_value,towed_artillery_strength_value,rocket_projector_strength_value,navy_fleet_strengths_value,aircraft_carrier_fleet_strength_value,submarine_fleet_strength_value,destroyer_fleet_strength_value,frigate_fleet_strength_value,defense_spending_budget_value,external_debt_value,airport_totals_value,oil_production_value,oil_consumption_value,proven_oil_reserves_value,available_manpower_value,total_population_value,total_square_land_area_value,total_coastline_coverage_value,total_waterway_coverage_value,total_border_coverage_value
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
Afghanistan,1.344,260,0,25,30,0,187,0,0,1062,0,176,50,0.0,0.0,0.0,0.0,0.0,12000000000.0,2840000000.0,43,0.0,5500.0,0.0,14325743.0,34940837.0,652230.0,0.0,1200.0,5987.0


In [53]:
df.columns

Index(['military_strength_power_index', 'aircraft_strength_value',
       'fighter/interceptor_strength_value', 'attack_aircraft_strength_value',
       'transport_aircraft_fleet_strength_value',
       'trainer_aircraft_fleet_value', 'helicopter_fleet_strength_value',
       'attack_helicopter_fleet_strength_value', 'tank_strength_value',
       'afv/apc_strength_value', 'self-propelled_artillery_strength_value',
       'towed_artillery_strength_value', 'rocket_projector_strength_value',
       'navy_fleet_strengths_value', 'aircraft_carrier_fleet_strength_value',
       'submarine_fleet_strength_value', 'destroyer_fleet_strength_value',
       'frigate_fleet_strength_value', 'defense_spending_budget_value',
       'external_debt_value', 'airport_totals_value', 'oil_production_value',
       'oil_consumption_value', 'proven_oil_reserves_value',
       'available_manpower_value', 'total_population_value',
       'total_square_land_area_value', 'total_coastline_coverage_value',
       '

In [54]:
df.shape

(138, 30)

In [55]:
for col in df.select_dtypes(include=[np.number]).columns:
    if df[col].min() == 0:
        print(col)

aircraft_strength_value
fighter/interceptor_strength_value
attack_aircraft_strength_value
transport_aircraft_fleet_strength_value
trainer_aircraft_fleet_value
helicopter_fleet_strength_value
attack_helicopter_fleet_strength_value
tank_strength_value
afv/apc_strength_value
self-propelled_artillery_strength_value
towed_artillery_strength_value
rocket_projector_strength_value
navy_fleet_strengths_value
aircraft_carrier_fleet_strength_value
submarine_fleet_strength_value
destroyer_fleet_strength_value
frigate_fleet_strength_value
oil_production_value
proven_oil_reserves_value
total_coastline_coverage_value
total_waterway_coverage_value
total_border_coverage_value


In [56]:
df[df["aircraft_strength_value"]==0]["aircraft_strength_value"]

country
Liberia    0
Somalia    0
Name: aircraft_strength_value, dtype: int64

In [57]:
df[df["total_coastline_coverage_value"]==0]["total_coastline_coverage_value"]

country
Afghanistan                0.000
Armenia                    0.000
Austria                    0.000
Belarus                    0.000
Bhutan                     0.000
Bolivia                    0.000
Botswana                   0.000
Burkina Faso               0.000
Central African Republic   0.000
Chad                       0.000
Czechia                    0.000
Ethiopia                   0.000
Hungary                    0.000
Kyrgyzstan                 0.000
Laos                       0.000
Mali                       0.000
Moldova                    0.000
Mongolia                   0.000
Nepal                      0.000
Niger                      0.000
North Macedonia            0.000
Paraguay                   0.000
Serbia                     0.000
Slovakia                   0.000
South Sudan                0.000
Switzerland                0.000
Tajikistan                 0.000
Uganda                     0.000
Zambia                     0.000
Zimbabwe                   0.000
Na

In [58]:
df[df["total_waterway_coverage_value"]==0]["total_waterway_coverage_value"]

country
Algeria                  0.000
Armenia                  0.000
Austria                  0.000
Azerbaijan               0.000
Bahrain                  0.000
Bhutan                   0.000
Bosnia and Herzegovina   0.000
Botswana                 0.000
Burkina Faso             0.000
Cameroon                 0.000
Chad                     0.000
El Salvador              0.000
Israel                   0.000
Kazakhstan               0.000
Kenya                    0.000
Kuwait                   0.000
Liberia                  0.000
Libya                    0.000
Mali                     0.000
Mauritania               0.000
Montenegro               0.000
Morocco                  0.000
Namibia                  0.000
Nepal                    0.000
New Zealand              0.000
Oman                     0.000
Qatar                    0.000
Saudi Arabia             0.000
Slovenia                 0.000
Somalia                  0.000
South Africa             0.000
South Sudan              0.000


In [59]:
df[df["total_border_coverage_value"]==0]["total_border_coverage_value"]

country
Australia     0.000
Bahrain       0.000
Japan         0.000
Madagascar    0.000
New Zealand   0.000
Philippines   0.000
Singapore     0.000
Sri Lanka     0.000
Taiwan        0.000
Name: total_border_coverage_value, dtype: float64

In [60]:
df[df["total_square_land_area_value"]==0]["total_square_land_area_value"]

Series([], Name: total_square_land_area_value, dtype: float64)

In [61]:
drop_columns = ["total_coastline_coverage_value",
               "total_waterway_coverage_value",
               "total_border_coverage_value",
               "total_square_land_area_value"]

In [62]:
df.drop(drop_columns, axis = 1, inplace = True)

In [63]:
df.shape

(138, 26)

In [64]:
df.columns

Index(['military_strength_power_index', 'aircraft_strength_value',
       'fighter/interceptor_strength_value', 'attack_aircraft_strength_value',
       'transport_aircraft_fleet_strength_value',
       'trainer_aircraft_fleet_value', 'helicopter_fleet_strength_value',
       'attack_helicopter_fleet_strength_value', 'tank_strength_value',
       'afv/apc_strength_value', 'self-propelled_artillery_strength_value',
       'towed_artillery_strength_value', 'rocket_projector_strength_value',
       'navy_fleet_strengths_value', 'aircraft_carrier_fleet_strength_value',
       'submarine_fleet_strength_value', 'destroyer_fleet_strength_value',
       'frigate_fleet_strength_value', 'defense_spending_budget_value',
       'external_debt_value', 'airport_totals_value', 'oil_production_value',
       'oil_consumption_value', 'proven_oil_reserves_value',
       'available_manpower_value', 'total_population_value'],
      dtype='object')

## Visualy inspect the some features

In [65]:
df.head(1)

Unnamed: 0_level_0,military_strength_power_index,aircraft_strength_value,fighter/interceptor_strength_value,attack_aircraft_strength_value,transport_aircraft_fleet_strength_value,trainer_aircraft_fleet_value,helicopter_fleet_strength_value,attack_helicopter_fleet_strength_value,tank_strength_value,afv/apc_strength_value,self-propelled_artillery_strength_value,towed_artillery_strength_value,rocket_projector_strength_value,navy_fleet_strengths_value,aircraft_carrier_fleet_strength_value,submarine_fleet_strength_value,destroyer_fleet_strength_value,frigate_fleet_strength_value,defense_spending_budget_value,external_debt_value,airport_totals_value,oil_production_value,oil_consumption_value,proven_oil_reserves_value,available_manpower_value,total_population_value
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
Afghanistan,1.344,260,0,25,30,0,187,0,0,1062,0,176,50,0.0,0.0,0.0,0.0,0.0,12000000000.0,2840000000.0,43,0.0,5500.0,0.0,14325743.0,34940837.0


In [66]:
# sns.pairplot(df, palette="inferno", corner=True);

In [67]:
# for col in df.columns:
#     plt.figure(figsize = (20,6))
#     sns.barplot(y = df[col], x = df.index, data = df)
#     plt.xticks(rotation = 45);

## Descriptive statistics

In [68]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
military_strength_power_index,138.0,1.461,1.324,0.061,0.575,1.034,2.022,10.168
aircraft_strength_value,138.0,388.471,1231.982,0.0,33.0,111.0,290.25,13264.0
fighter/interceptor_strength_value,138.0,81.565,230.325,0.0,0.0,17.0,60.5,2085.0
attack_aircraft_strength_value,138.0,25.761,94.528,0.0,0.0,0.0,15.75,742.0
transport_aircraft_fleet_strength_value,138.0,30.232,92.33,0.0,3.0,9.0,26.0,945.0
trainer_aircraft_fleet_value,138.0,82.833,240.804,0.0,4.25,26.0,79.25,2643.0
helicopter_fleet_strength_value,138.0,154.065,520.184,0.0,15.25,44.0,126.75,5768.0
attack_helicopter_fleet_strength_value,138.0,25.623,97.326,0.0,0.0,2.0,17.75,967.0
tank_strength_value,138.0,646.565,1515.464,0.0,19.25,153.0,436.25,12950.0
afv/apc_strength_value,138.0,2485.696,5410.546,0.0,250.25,734.5,2000.0,39253.0


In [69]:
skew_vals = df.skew().sort_values(ascending=False)
skew_vals

defense_spending_budget_value             9.959
airport_totals_value                      9.845
aircraft_carrier_fleet_strength_value     9.772
helicopter_fleet_strength_value           9.497
trainer_aircraft_fleet_value              9.035
aircraft_strength_value                   8.789
transport_aircraft_fleet_strength_value   7.951
attack_helicopter_fleet_strength_value    7.883
destroyer_fleet_strength_value            7.850
available_manpower_value                  7.377
total_population_value                    7.110
external_debt_value                       7.052
oil_consumption_value                     6.874
attack_aircraft_strength_value            6.489
self-propelled_artillery_strength_value   6.258
fighter/interceptor_strength_value        6.226
rocket_projector_strength_value           5.345
tank_strength_value                       5.082
proven_oil_reserves_value                 4.881
submarine_fleet_strength_value            4.790
afv/apc_strength_value                  

In [70]:
skew_limit = 0.5 # This is our threshold-limit to evaluate skewness. Overall below abs(1) seems acceptable for the linear models. 
skew_vals = df.skew()
skew_cols = skew_vals[abs(skew_vals)> skew_limit].sort_values(ascending=False)
skew_cols 

defense_spending_budget_value             9.959
airport_totals_value                      9.845
aircraft_carrier_fleet_strength_value     9.772
helicopter_fleet_strength_value           9.497
trainer_aircraft_fleet_value              9.035
aircraft_strength_value                   8.789
transport_aircraft_fleet_strength_value   7.951
attack_helicopter_fleet_strength_value    7.883
destroyer_fleet_strength_value            7.850
available_manpower_value                  7.377
total_population_value                    7.110
external_debt_value                       7.052
oil_consumption_value                     6.874
attack_aircraft_strength_value            6.489
self-propelled_artillery_strength_value   6.258
fighter/interceptor_strength_value        6.226
rocket_projector_strength_value           5.345
tank_strength_value                       5.082
proven_oil_reserves_value                 4.881
submarine_fleet_strength_value            4.790
afv/apc_strength_value                  

In [71]:
#Interpreting Skewness 

for skew in skew_vals:
    if -0.5 < skew < 0.5:
        print ("A skewness value of", '\033[1m', Fore.GREEN, skew, '\033[0m', "means that the distribution is approx.", '\033[1m', Fore.GREEN, "symmetric", '\033[0m')
    elif  -0.5 < skew < -1.0 or 0.5 < skew < 1.0:
        print ("A skewness value of", '\033[1m', Fore.YELLOW, skew, '\033[0m', "means that the distribution is approx.", '\033[1m', Fore.YELLOW, "moderately skewed", '\033[0m')
    else:
        print ("A skewness value of", '\033[1m', Fore.RED, skew, '\033[0m', "means that the distribution is approx.", '\033[1m', Fore.RED, "highly skewed", '\033[0m')

A skewness value of [1m [31m 2.6706592171090633 [0m means that the distribution is approx. [1m [31m highly skewed [0m
A skewness value of [1m [31m 8.789333213231357 [0m means that the distribution is approx. [1m [31m highly skewed [0m
A skewness value of [1m [31m 6.225776745899961 [0m means that the distribution is approx. [1m [31m highly skewed [0m
A skewness value of [1m [31m 6.489261954280916 [0m means that the distribution is approx. [1m [31m highly skewed [0m
A skewness value of [1m [31m 7.95096255063359 [0m means that the distribution is approx. [1m [31m highly skewed [0m
A skewness value of [1m [31m 9.035121167203796 [0m means that the distribution is approx. [1m [31m highly skewed [0m
A skewness value of [1m [31m 9.496532868859475 [0m means that the distribution is approx. [1m [31m highly skewed [0m
A skewness value of [1m [31m 7.882557750772219 [0m means that the distribution is approx. [1m [31m highly skewed [0m
A skewness value

In [72]:
kurtosis_vals = df.kurtosis().sort_values(ascending=False)
kurtosis_vals

defense_spending_budget_value             106.869
airport_totals_value                      105.399
aircraft_carrier_fleet_strength_value     105.211
helicopter_fleet_strength_value           100.829
trainer_aircraft_fleet_value               94.603
aircraft_strength_value                    88.850
transport_aircraft_fleet_strength_value    73.350
destroyer_fleet_strength_value             69.845
attack_helicopter_fleet_strength_value     69.606
external_debt_value                        59.907
available_manpower_value                   57.864
oil_consumption_value                      54.228
total_population_value                     53.839
fighter/interceptor_strength_value         46.648
self-propelled_artillery_strength_value    45.347
attack_aircraft_strength_value             45.044
tank_strength_value                        33.708
rocket_projector_strength_value            32.166
frigate_fleet_strength_value               31.408
proven_oil_reserves_value                  25.894


In [73]:
#Calculating Kurtosis 

kurtosis_limit = 7 # This is our threshold-limit to evaluate skewness. Overall below abs(1) seems acceptable for the linear models.
kurtosis_vals = df.kurtosis()
kurtosis_cols = kurtosis_vals[abs(kurtosis_vals) > kurtosis_limit].sort_values(ascending=False)
kurtosis_cols

defense_spending_budget_value             106.869
airport_totals_value                      105.399
aircraft_carrier_fleet_strength_value     105.211
helicopter_fleet_strength_value           100.829
trainer_aircraft_fleet_value               94.603
aircraft_strength_value                    88.850
transport_aircraft_fleet_strength_value    73.350
destroyer_fleet_strength_value             69.845
attack_helicopter_fleet_strength_value     69.606
external_debt_value                        59.907
available_manpower_value                   57.864
oil_consumption_value                      54.228
total_population_value                     53.839
fighter/interceptor_strength_value         46.648
self-propelled_artillery_strength_value    45.347
attack_aircraft_strength_value             45.044
tank_strength_value                        33.708
rocket_projector_strength_value            32.166
frigate_fleet_strength_value               31.408
proven_oil_reserves_value                  25.894


# Data Preprocessing

# Modelling

## Hopkins Test 
#### Assess the clusterability of a dataset. A score between 0 and 1, a score around 0.5 express no clusterability, a score tending to 1 express an uniform distrubution so clustering can't be useful and  0 express  not an uniform distrubution hence clusterering can be useful.

In [74]:
from pyclustertend import hopkins

ModuleNotFoundError: No module named 'pyclustertend'

In [None]:
hopkins(df, df.shape[0])

In [None]:
df1 = df.copy()
df2 = df.copy()
df3 = df.copy()

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, scale

In [None]:
hopkins(scale(df1), df1.shape[0])

In [None]:
hopkins(MinMaxScaler().fit_transform(df2), df2.shape[0])

In [None]:
hopkins(StandardScaler().fit_transform(df3), df3.shape[0])

In [None]:
hopkins(scale(df), df.shape[0])

## Clustering with K-means

In [None]:
df1 = df.copy()

In [None]:
df1.reset_index(inplace=True)

In [None]:
X = df1.drop("country", axis = 1)

In [None]:
scaler = scale

In [None]:
X_scaled = pd.DataFrame(scaler(X))

In [None]:
X_scaled.head(1)

In [None]:
X_scaled.shape

In [None]:
K_means_model = KMeans(random_state = 101)

In [None]:
K_means_model.fit_predict(X)

In [None]:
# K_means_model.fit(X)

In [None]:
# K_means_model.labels_

In [None]:
# X["clusters"] = K_means_model.labels_

In [None]:
# X

### Elbow Method 
We use this method to find optimal k value by looking up to break point of graph. 

In [None]:
ssd = []

K = range(2,10)

for k in K:
    model = KMeans(n_clusters =k, random_state=101)
    model.fit(X_scaled)
    ssd.append(model.inertia_)

In [None]:
plt.plot(K, ssd, "bo-")
plt.xlabel("Different k values")
plt.ylabel("inertia-error") 
plt.title("elbow method") 

In [None]:
ssd # sum of squared distance

In [None]:
pd.Series(ssd).diff()

In [None]:
df1_diff =pd.DataFrame(-pd.Series(ssd).diff()).rename(index = lambda x : x+1)
df1_diff

In [None]:
df1_diff.plot(kind='bar')

In [None]:
from yellowbrick.cluster import KElbowVisualizer

model_ = KMeans(random_state=101)
visualizer = KElbowVisualizer(model_, k=(2,9))

visualizer.fit(X_scaled)        # Fit the data to the visualizer
visualizer.show();

### Silhouette analysis

In [None]:
from sklearn.metrics import silhouette_score

In [None]:
silhouette_score(X_scaled, K_means_model.labels_)

In [None]:
range_n_clusters = range(2,9)
for num_clusters in range_n_clusters:
    # intialise kmeans
    kmeans = KMeans(n_clusters=num_clusters, random_state=101)
    kmeans.fit(X_scaled)
    cluster_labels = kmeans.labels_
    # silhouette score
    silhouette_avg = silhouette_score(X_scaled, cluster_labels)
    print(f"For n_clusters={num_clusters}, the silhouette score is {silhouette_avg}")

In [None]:
from sklearn.cluster import KMeans

from yellowbrick.cluster import SilhouetteVisualizer

model3 = KMeans(n_clusters=4, random_state=101)
visualizer = SilhouetteVisualizer(model3)

visualizer.fit(X_scaled)    # Fit the data to the visualizer
visualizer.poof();

### Model Building and label visualisation

In [None]:
model = KMeans(n_clusters =4, random_state=101)
model.fit_predict(X_scaled)

In [None]:
model.labels_

In [None]:
X["clusters"] = model.fit_predict(X_scaled)

In [None]:
X.head()

In [None]:
X.clusters.value_counts()

In [None]:
df1.head()

In [None]:
df1["predicted_clusters"] = model.fit_predict(X_scaled)

In [None]:
df1.tail(10)

In [None]:
df1[df1["predicted_clusters"]==0]["country"]

In [None]:
df1[df1["predicted_clusters"]==1]["country"]

In [None]:
df1[df1["predicted_clusters"]==2]["country"]

In [None]:
df1[df1["predicted_clusters"]==3]["country"]

In [None]:
df1["predicted_clusters"].value_counts().plot(kind="pie", autopct='%1.1f%%',figsize=(10,10));

In [None]:
sns.countplot(x=df1["predicted_clusters"], data=df1)

### Evaluate 

## Hierarchical Clustering

In [None]:
df2 = df.reset_index()

In [None]:
df2

In [None]:
df2 = df2.set_index("country")

In [None]:
X_scaled = scale(df2)

In [None]:
hopkins(X_scaled, X.shape[0])

In [None]:
#!pip install pyclustertend
from pyclustertend import hopkins
hopkins(X_scaled, X.shape[0])

### Dendogram

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage
plt.figure(figsize = (20,8))
plt.title("Dendrogram")
plt.xlabel("Observations")
plt.ylabel("Distance")
dendrogram(linkage(X_scaled, method = "ward"), leaf_font_size = 5);

In [None]:
hc_ward = linkage(y = X_scaled, method = "ward")
hc_complete = linkage(X_scaled, "complete")
hc_average = linkage(X_scaled, "average")
hc_single = linkage(X_scaled, "single")

plt.figure(figsize = (20,12))

plt.subplot(221)
plt.title("Ward")
plt.xlabel("Observations")
plt.ylabel("Distance")
dendrogram(hc_ward, leaf_font_size = 10)

plt.subplot(222)
plt.title("Complete")
plt.xlabel("Observations")
plt.ylabel("Distance")
dendrogram(hc_complete, leaf_font_size = 10)

plt.subplot(223)
plt.title("Average")
plt.xlabel("Observations")
plt.ylabel("Distance")
dendrogram(hc_average, leaf_font_size = 10)

plt.subplot(224)
plt.title("Single")
plt.xlabel("Observations")
plt.ylabel("Distance")
dendrogram(hc_single, leaf_font_size = 10);

In [None]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
K = range(2,10)

for k in K:
    model_labels = AgglomerativeClustering(n_clusters = k,  
                                    affinity='euclidean', 
                                    linkage='ward').fit_predict(X_scaled)
    print(f'Silhouette Score for {k} clusters: {silhouette_score(X_scaled, model_labels)}')


In [None]:
plt.figure(figsize = (20,6))

plt.subplot(131)
sns.boxplot(data = df, y = "Murder", x = "predicted_clusters")

plt.subplot(132)
sns.boxplot(data = df, y = "Assault", x = "predicted_clusters")

plt.subplot(133)
sns.boxplot(data = df, y = "Rape", x = "predicted_clusters");

In [None]:
model1_labels = AgglomerativeClustering(n_clusters = 5,
                                         affinity='euclidean',
                                         linkage='ward').fit_predict(X_scaled)

In [None]:
len(df.columns)

In [None]:
for i in range(1, len(df.columns)):
    plt.scatter(df2.iloc[:,0], df2.iloc[:,i], c = model1_labels, cmap = "viridis");

In [None]:
plt.figure(figsize = (20,60))

plt.subplot(9,3,1)
plt.title("Original")
plt.scatter(df2.iloc[:,0], df2.iloc[:,1], c = model1_labels, cmap = "viridis");
plt.subplot(9,3,2)
plt.title("Original")
plt.scatter(df2.iloc[:,0], df2.iloc[:,2], c = model1_labels, cmap = "viridis");
plt.subplot(9,3,3)
plt.title("Original")
plt.scatter(df2.iloc[:,0], df2.iloc[:,3], c = model1_labels, cmap = "viridis");
plt.subplot(9,3,4)
plt.title("Original")
plt.scatter(df2.iloc[:,0], df2.iloc[:,4], c = model1_labels, cmap = "viridis");
plt.subplot(9,3,5)
plt.title("Original")
plt.scatter(df2.iloc[:,0], df2.iloc[:,5], c = model1_labels, cmap = "viridis");
plt.subplot(9,3,6)
plt.title("Original")
plt.scatter(df2.iloc[:,0], df2.iloc[:,6], c = model1_labels, cmap = "viridis");
plt.subplot(9,3,7)
plt.title("Original")
plt.scatter(df2.iloc[:,0], df2.iloc[:,7], c = model1_labels, cmap = "viridis");
plt.subplot(9,3,8)
plt.title("Original")
plt.scatter(df2.iloc[:,0], df2.iloc[:,8], c = model1_labels, cmap = "viridis");
plt.subplot(9,3,9)
plt.title("Original")
plt.scatter(df2.iloc[:,0], df2.iloc[:,9], c = model1_labels, cmap = "viridis");
plt.subplot(9,3,10)
plt.title("Original")
plt.scatter(df2.iloc[:,0], df2.iloc[:,10], c = model1_labels, cmap = "viridis");
plt.subplot(9,3,11)
plt.title("Original")
plt.scatter(df2.iloc[:,0], df2.iloc[:,11], c = model1_labels, cmap = "viridis");
plt.subplot(9,3,12)
plt.title("Original")
plt.scatter(df2.iloc[:,0], df2.iloc[:,12], c = model1_labels, cmap = "viridis");

In [None]:
clusters = model1_labels
df2["predicted_clusters_hc"] = clusters

In [None]:
df2.head()

In [None]:
df2["predicted_clusters_hc"].value_counts()

In [None]:
df2[df2["predicted_clusters_hc"]==0].index

In [None]:
df2[df2["predicted_clusters_hc"]==1].index

In [None]:
df2[df2["predicted_clusters_hc"]==2].index

In [None]:
df2[df2["predicted_clusters_hc"]==3].index

In [None]:
df2[df2["predicted_clusters_hc"]==4].index

In [None]:
df2["military_power"] = df2["predicted_clusters_hc"].map({  0 : "medium",
                                                    1 : "powerfull2",
                                                    2 : "most_powerful",
                                                    3 : "powerfull1",
                                                    4 : "low",
                                                 })

In [None]:
df2

In [None]:
df2 = df2.reset_index()

In [None]:
df2.groupby("predicted_clusters_hc")["country"].unique()

In [None]:
df2["military_power"].value_counts()

In [None]:
df2.groupby("military_power")["country"].unique()["low"]

In [None]:
df2.groupby("military_power")["country"].unique()["medium"]

In [None]:
df2.groupby("military_power")["country"].unique()["powerfull2"]

In [None]:
df2.groupby("military_power")["country"].unique()["powerfull1"]

In [None]:
df2.groupby("military_power")["country"].unique()["most_powerful"]

In [None]:
class0 = df2.nlargest(138, "military_strength_power_index").iloc[0:111, 0]

In [None]:
class1 = df2.nlargest(138, "military_strength_power_index").iloc[111:135, 0]

In [None]:
class2 = df2.nlargest(138, "military_strength_power_index").iloc[135:137, 0]
class2

In [None]:
class3 = df2.nlargest(138, "military_strength_power_index").iloc[137:138, 0]
class3

In [None]:
df2["domain_class"] = ""

In [None]:
df2["domain_class"][0:111]=0
df2["domain_class"][111:135]=1
df2["domain_class"][135:137]=2
df2["domain_class"][137:138]=3

In [None]:
df2["domain_class"].value_counts(dropna=False)

In [None]:
df2[["domain_class","predicted_clusters_hc"]]

In [None]:
ct = pd.crosstab(df2["domain_class"],df2["predicted_clusters_hc"])
ct

In [None]:
plt.figure(figsize = (20,6))

plt.subplot(121)
plt.title("Original")
plt.scatter(df2.iloc[:,0], df2.iloc[:,1], c = df2.domain_class, s = 50, cmap = "rainbow")

plt.subplot(122)
plt.title("Predicted")
plt.scatter(df2.iloc[:,0], df2.iloc[:,1], c = df2.domain_class, s = 50, cmap = "rainbow");

In [None]:
plt.figure(figsize = (20,6))

plt.subplot(121)
plt.title("Original")
plt.scatter(df2.iloc[:,2], df2.iloc[:,3], c = df2.domain_class, s = 50, cmap = "rainbow")

plt.subplot(122)
plt.title("Predicted")
plt.scatter(df2.iloc[:,2], df2.iloc[:,3], c = df2.domain_class, s = 50, cmap = "rainbow");

In [None]:
from sklearn.metrics.cluster import adjusted_rand_score
adjusted_rand_score(df2['domain_class'], df2['predicted_clusters'])

### Model Building and label visualisation

### Evaluate 