In [9]:

#This block is creating the Linear Regression Model for Average General Performance
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Load the input table for this run.
data = pd.read_csv('/Users/bryantreese/desktop/current_run.csv')

# Troop-count columns used to build the regression features.
troop_types = ['Infantry', 'Cavalry', 'Artillery', 'Ships', 'Airforce', 'Special']


def _finite_or_zero(counts):
    """Return `counts` with +/-inf and NaN entries replaced by 0."""
    without_inf = counts.replace([np.inf, -np.inf], np.nan)
    return without_inf.fillna(0)


# A missing (or infinite) troop count is treated as zero troops.
for column in troop_types:
    data[column] = _finite_or_zero(data[column])


# Signed troop counts per row: +count when the row's 'pos' is 'L', -count when
# it is 'R', and 0 for any other (or missing) 'pos' value.
# This is computed column-wise rather than with iterrows() and per-cell .loc
# writes: the row loop is slow, can write floats into columns that were
# initialised as integer 0, and is fragile if index labels are not unique.
side_sign = data['pos'].map({'L': 1, 'R': -1}).fillna(0)

for troop in troop_types:
    data[f'{troop}_difference'] = data[troop] * side_sign

# Encode 'VorD' numerically: 'V' -> 1, 'D' -> -1, 'I' -> 0.
# Values that match no key (including NaN) come out of map() as NaN and are
# then filled with 0, i.e. they are treated the same as 'I'.
outcome_codes = {'V': 1, 'D': -1, 'I': 0}
encoded_outcome = data['VorD'].map(outcome_codes)
data['VorD'] = encoded_outcome.fillna(0)

# Model inputs are the per-troop signed differences; the target is the encoded 'VorD'.
features = []
for troop in troop_types:
    features.append(f'{troop}_difference')

X = data[features]
y = data['VorD']


# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the linear regression on the training portion (fit() returns the estimator).
model = LinearRegression().fit(X_train, y_train)

# Score the held-out rows.
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')

# Pair each feature name with its fitted coefficient.
coefficients = dict(zip(features, model.coef_))
print('Model coefficients:', coefficients)


Mean Squared Error: 0.9097729708650691
Model coefficients: {'Infantry_difference': 3.164860307656626e-07, 'Cavalry_difference': 1.0147512443904323e-05, 'Artillery_difference': -3.644926359295697e-05, 'Ships_difference': 0.00014383250089455512, 'Airforce_difference': -0.00014478512223787228, 'Special_difference': 2.3016291091711602e-05}


In [14]:
# This cell uses the fitted linear regression model to compute each general's
# WAR, aWAR and number of battles.

# Model prediction for every row (training and held-out rows alike).
data['predicted_VorD'] = model.predict(X)

# Actual outcome minus predicted outcome: positive means the recorded result
# was better than the model predicted from the troop differences.
data['performance_difference'] = data['VorD'] - data['predicted_VorD']

# One row per belligerent. The count column is spelled 'number_of_battles'
# (all lower-case) because that is the name the later cells use when they
# filter this frame and when they read the exported CSV back in.
belligerent_performance = (
    data.groupby('belligerent')
    .agg(
        number_of_battles=('belligerent', 'size'),
        WAR=('performance_difference', 'sum'),
        aWAR=('performance_difference', 'mean'),
    )
    .reset_index()
    # Present the grouping key as 'General' in the output table.
    .rename(columns={'belligerent': 'General'})
)

belligerent_performance.to_csv('/Users/bryantreese/desktop/Generals_Performance.csv', index=False)


In [15]:
# Mean squared error of the model over every row, not only the held-out split.
overall_predictions = data['predicted_VorD']
mse = mean_squared_error(y_true=y, y_pred=overall_predictions)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 0.9132898237756123


In [20]:
# Outlier analysis: the extreme performers among generals with enough battles.

# Only generals with at least this many battles are considered.
min_battles = 3
# Number of rows taken from each end of each ranking.
top_n = 10

has_enough_battles = belligerent_performance['number_of_battles'] >= min_battles
belligerent_performance_outlier = belligerent_performance.loc[has_enough_battles]

# Highest and lowest total WAR.
top_10_war = belligerent_performance_outlier.nlargest(top_n, 'WAR')
worst_10_war = belligerent_performance_outlier.nsmallest(top_n, 'WAR')

# Highest and lowest aWAR.
top_10_awar = belligerent_performance_outlier.nlargest(top_n, 'aWAR')
worst_10_awar = belligerent_performance_outlier.nsmallest(top_n, 'aWAR')

# Stack the four rankings, dropping rows that appear in more than one of them.
ranked_groups = [top_10_war, worst_10_war, top_10_awar, worst_10_awar]
outliers = pd.concat(ranked_groups).drop_duplicates().reset_index(drop=True)

outliers.to_csv('/Users/bryantreese/desktop/Outlier_Performers.csv', index=False)

print(outliers)

                                       General  number_of_battles        WAR  \
0                                     Napoleon                 43  30.165273   
1     Arthur Wellesley, 1st Duke of Wellington                 18  14.223505   
2                                Julius Caesar                 17  12.732013   
3                               Takeda Shingen                 18  12.090567   
4                          Khalid ibn al-Walid                 14  10.580268   
5                             Ulysses S. Grant                 16   9.857647   
6                                     Hannibal                 17   9.657072   
7                                 Oda Nobunaga                 11   8.435425   
8                          Alexander the Great                  9   8.254001   
9                          Frederick the Great                 14   8.189264   
10                          Capital punishment                 16 -10.994661   
11                            Max von Ga

In [24]:
# Read the per-general summary table back from disk.
performance_data = pd.read_csv('/Users/bryantreese/desktop/Generals_Performance.csv')

# Correlation of the battle count with WAR and with aWAR.
battle_counts = performance_data['number_of_battles']
WB_correlation = battle_counts.corr(performance_data['WAR'])
aWB_correlation = battle_counts.corr(performance_data['aWAR'])

# Report both coefficients.
print("Correlation between Number of Battles and WAR:", WB_correlation)
print("Correlation between Number of Battles and aWAR:", aWB_correlation)

Correlation between Number of Battles and WAR: 0.2657519806077616
Correlation between Number of Battles and aWAR: 0.059878398289835655
