In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler,LabelEncoder
from sklearn.linear_model import LogisticRegression 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import warnings 
warnings.filterwarnings('ignore')

Loading Dataset

In [None]:
# Read the mission records from the CSV next to the notebook and preview the first rows.
df = pd.read_csv(filepath_or_buffer='defence_mission_dataset.csv')
df.head(5)

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame.
df.info()

Checking for Missing Values

In [None]:
# Number of missing entries in each column.
df.isna().sum()

Handling Null values

In [None]:
# Impute missing values by assigning the filled column back to the frame.
# `df[col].fillna(..., inplace=True)` is chained assignment: it raises a FutureWarning
# on pandas 2.x and, once Copy-on-Write is active (the default from pandas 3.0),
# it only modifies a temporary object and leaves `df` unchanged.
df['Morale_Score'] = df['Morale_Score'].fillna(df['Morale_Score'].median())  # median of the column
df['Supply_Level'] = df['Supply_Level'].fillna(df['Supply_Level'].mode()[0])  # first (most frequent) mode
# Re-check the per-column null counts after imputation.
df.isna().sum()

In [None]:
# Summary statistics of the frame after imputation.
df.describe()

Outliers Detection

In [None]:
# Side-by-side box plots used to eyeball outliers in the two score columns.
outlier_columns = ['Resilience_Index', 'Success_Rate']
fig, ax = plt.subplots(1, len(outlier_columns), figsize=(10, 5))
for panel, column in zip(ax, outlier_columns):
    panel.boxplot(df[column])
    panel.set_title(column)
plt.tight_layout()
plt.show()


Handling Outliers

In [None]:
# Cap Resilience_Index at the IQR fences: values beyond 1.5 * IQR from the quartiles
# are replaced by the fence value itself (rows are kept, not dropped).
resilience = df['Resilience_Index']
q1, q3 = resilience.quantile(0.25), resilience.quantile(0.75)
iqr = q3 - q1
upper_limit = q3 + 1.5 * iqr
lower_limit = q1 - 1.5 * iqr

# First raise values below the lower fence, then lower values above the upper fence.
raised_to_lower = np.where(resilience < lower_limit, lower_limit, resilience)
df['Resilience_Index'] = np.where(resilience > upper_limit, upper_limit, raised_to_lower)

# Re-draw the box plot to confirm the capping.
plt.boxplot(df['Resilience_Index'])
plt.title('Resilience_Index')
plt.show()

In [None]:
# Cap Success_Rate at the IQR fences: values beyond 1.5 * IQR from the quartiles
# are replaced by the fence value itself (rows are kept, not dropped).
success_rate = df['Success_Rate']
q1, q3 = success_rate.quantile(0.25), success_rate.quantile(0.75)
iqr = q3 - q1
upper_limit = q3 + 1.5 * iqr
lower_limit = q1 - 1.5 * iqr

# First raise values below the lower fence, then lower values above the upper fence.
raised_to_lower = np.where(success_rate < lower_limit, lower_limit, success_rate)
df['Success_Rate'] = np.where(success_rate > upper_limit, upper_limit, raised_to_lower)

# Re-draw the box plot to confirm the capping.
plt.boxplot(df['Success_Rate'])
plt.title('Success_Rate')
plt.show()

Univariate Analysis

In [None]:
# Share of each unit type.
# The wedge sizes come from value_counts(), which orders categories by frequency.
# The labels must therefore come from that same result's index; `unique()` returns
# categories in order of first appearance and would attach names to the wrong wedges.
unit_type_counts = df['Unit_Type'].value_counts()
plt.pie(x=unit_type_counts, autopct='%.2f%%', labels=unit_type_counts.index, colors=sns.color_palette('Set2'))
plt.title('Distribution of Unit types')
plt.show()

Interpretation: The pie chart illustrates the distribution of unit types. Aviation and Armored units comprise approximately 24.5% each, followed by Unknown and Infantry units at around 24.4% and 24.2% respectively. Artillery units represent a smaller portion at 2.00%. 


In [None]:
# Violin plot of the Training_Level column, drawn on an explicitly created axes.
violin_fig, violin_ax = plt.subplots(figsize=(10, 6))
sns.violinplot(x=df['Training_Level'], palette='viridis', ax=violin_ax)
violin_ax.set_title('Distribution of Training Level')
violin_ax.set_xlabel('Training Level')
plt.show()


Interpretation: The violin plot illustrates the distribution of training levels across intermediate, basic, and advanced categories. While the basic level exhibits a wider range of values and a more symmetrical distribution, intermediate and advanced levels show a denser concentration of data points with longer tails, suggesting greater variability in those groups. 


In [None]:
# Communication_Quality of the first 1000 rows, plotted against the row index.
first_records = df['Communication_Quality'].head(1000)
line_fig, line_ax = plt.subplots(figsize=(15, 4))
sns.lineplot(first_records, marker='.', palette='viridis', ax=line_ax)
line_ax.set_title('Communication Quality Over Time')
line_ax.set_xlabel('Index')
line_ax.set_ylabel('Communication Quality')
line_ax.grid(True)
plt.show()


Interpretation: The line chart shows communication quality for the first 1,000 records, plotted against the row index rather than a time stamp. The horizontal axis runs from 0 to about 1,000 only because of the number of records shown; the values themselves remain consistently low, fluctuating between 0 and 10. The series varies strongly from one record to the next, with no discernible pattern, trend, or predictability, suggesting inconsistent communication performance across the observed records.


In [None]:
# Number of records per base location; the counts are computed once and reused
# for both the category axis and the bar heights.
location_counts = df['Base_Location'].value_counts()
plt.figure(figsize=(10, 4))
sns.barplot(x=location_counts.index, y=location_counts.values, palette='viridis')
plt.title('Base Location')
plt.xlabel('Base Location')
plt.ylabel('Count')
plt.show()


Interpretation: The bar chart displays the distribution of base locations. Coastal locations have the highest count, followed closely by mountainous and rural areas. Urban locations have a significantly lower count, and the number of unknown locations is minimal.

Bivariate Analysis

In [None]:
# Communication quality per base location. seaborn's lineplot aggregates the rows that
# share an x value (its default estimator is the mean) and draws one point per location.
# The original trailing comma after plt.figure(...) built a throwaway tuple, and
# `palette` has no effect without a `hue` variable, so both are removed.
plt.figure(figsize=(15, 4))
sns.lineplot(data=df, x='Base_Location', y='Communication_Quality')
plt.title('Communication Quality by Base Location')
plt.show()

Interpretation: The chart illustrates the relationship between communication quality and base location. There's a slight upward trend in communication quality from urban to mountainous locations, followed by a more pronounced increase for coastal and unknown locations. The variability in communication quality is relatively consistent across different base locations.

In [None]:
# Casualty count per engagement-frequency category, using only the first 200 rows.
# seaborn's barplot shows an aggregate per category (its default estimator is the mean).
# A title and plt.show() are added so the cell matches the other plot cells and
# does not echo the Axes object as its output.
plt.figure(figsize=(10, 4))
sns.barplot(data=df.head(200), x='Engagement_Frequency', y='Casualty_Count', palette='viridis')
plt.title('Casualty Count by Engagement Frequency (first 200 records)')
plt.show()

Interpretation: The graph illustrates a positive correlation between engagement frequency and casualty count. Higher engagement frequency is associated with a greater average number of casualties. However, the variability in casualty count is higher for less frequent engagements.

In [None]:
# Pairwise correlation of the five numeric score/count columns, shown as a heatmap.
numeric_columns = ['Morale_Score', 'Communication_Quality', 'Casualty_Count',
                   'Resilience_Index', 'Success_Rate']
correlation = df[numeric_columns].corr()

plt.figure(figsize=(8, 4))
sns.heatmap(correlation, annot=True, cmap='rocket', linewidths=0.5)
plt.title('Correlation Analysis')
plt.show()


Interpretation: The correlation matrix reveals negligible relationships between variables. Morale Score, Communication Quality, Casualty Count, Resilience Index, and Success Rate exhibit minimal correlation with each other, suggesting independent influences on the overall outcome.

Important features are selected using an ensemble technique. By leveraging entropy to minimize impurity within the dataset, the importance of each feature can be assessed from its contribution to predicting the target variable, and the features with the best predictive value are retained.


In [None]:
# Columns used as model inputs; the target column is selected in a later cell.
feature_columns = [
    'Success_Rate',
    'Resilience_Index',
    'Communication_Quality',
    'Morale_Score',
    'Casualty_Count',
    'Base_Location',
    'Unit_Type',
    'Equipment_Readiness',
    'Operation_Type',
    'Engagement_Frequency',
]
features = df[feature_columns]


In [None]:
# Label-encode every text (object-dtype) feature.
# The scaling cell further down passes `features` to MinMaxScaler, which only accepts
# numeric input; without this step that cell fails on a fresh kernel because columns
# such as Base_Location and Unit_Type still hold strings.
cat = features.select_dtypes(include='object')
features = features.copy()  # `features` was sliced out of `df`; work on an independent copy
for column in cat.columns:
    # One encoder per column: each distinct string is mapped to an integer code.
    # NOTE(review): the codes impose an arbitrary order on the categories; confirm
    # that is acceptable for the models used below.
    features[column] = LabelEncoder().fit_transform(features[column])

Label Encoding

Data Splitting

In [None]:
# Prediction target and model inputs.
# NOTE(review): despite its name, `categorical_features` refers to the whole
# `features` frame, not only its categorical columns.
target = df['Mission_Outcome']
categorical_features = features

In [None]:
# Number of rows per Mission_Outcome class (shows how balanced the target is).
target.value_counts()

In [None]:
# Min-max scale every feature column; `categorical_features` is replaced by the
# transformer's output.
# NOTE(review): the scaler is fitted on all rows before the train/test split, so the
# test rows influence the scaling ranges. Fitting on the training rows only would
# avoid that leakage, but requires reordering this cell and the split cell.
scaler = MinMaxScaler()
scaler.fit(categorical_features)
categorical_features = scaler.transform(categorical_features)
categorical_features

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    categorical_features,
    target,
    train_size=0.8,
    random_state=0,
)

Model Building

Logistic Regression

In [None]:
# Fit logistic regression (up to 1000 solver iterations) and report test accuracy.
lg = LogisticRegression(max_iter=1000).fit(x_train, y_train)
y_pred = lg.predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

Random Forest 

In [None]:
# Fit a 100-tree random forest and report test accuracy.
# random_state is fixed (same seed as the train/test split) so that the score, and the
# model that is pickled later in the notebook, are reproducible across runs.
rf = RandomForestClassifier(n_estimators=100, random_state=0)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)
accuracy_score(y_test, y_pred)

Decision Tree

In [None]:
# Fit a single decision tree and report test accuracy.
# random_state is fixed (same seed as the train/test split) so the fitted tree and
# its score are reproducible across runs.
dtree = DecisionTreeClassifier(random_state=0)
dtree.fit(x_train, y_train)
y_pred = dtree.predict(x_test)
accuracy_score(y_test, y_pred)

Support Vector Machine

In [None]:
# Fit an RBF-kernel support vector classifier and report test accuracy.
svm = SVC(kernel='rbf').fit(x_train, y_train)
y_pred = svm.predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

K-Nearest Neighbours

In [None]:
# Fit a 2-nearest-neighbours classifier and report test accuracy.
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(x_train, y_train)
# Predict with `knn` itself; the original called `lg.predict`, so the accuracy shown
# here was the logistic-regression score rather than the KNN score.
y_pred = knn.predict(x_test)
accuracy_score(y_test, y_pred)

Model Evaluation Metrics

In [None]:
# Per-class precision / recall / F1 on the test split.
# `y_pred` holds the predictions of whichever model cell was executed last.
cf = classification_report(y_true=y_test, y_pred=y_pred)
print(cf)


Confusion Matrix

In [None]:
# Confusion matrix of the test-set predictions in `y_pred`.
# confusion_matrix(y_true, y_pred) puts the true labels on the rows and the predicted
# labels on the columns. The heatmap draws rows along the y-axis and columns along the
# x-axis, so the x-axis is the prediction and the y-axis is the truth (the original
# axis labels were swapped).
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d')
plt.title('Confusion Matrix')
plt.xlabel('Y_prediction')
plt.ylabel('Y_truth')
plt.show()

Saving the Model to a Pickle File

In [None]:
import pickle

# Serialize the fitted random forest (`rf`) to model.pkl in the working directory.
with open('model.pkl', mode='wb') as model_file:
    pickle.dump(rf, model_file)


In [None]:
# Read the pickled model back and check that it can still predict on the test split.
# NOTE: pickle.load executes code stored in the file; only load pickles you created
# yourself or otherwise trust.
with open("model.pkl", mode='rb') as model_file:
    model = pickle.load(model_file)

model.predict(x_test)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_auc_score

# NOTE: this cell is a self-contained multiclass ROC demonstration on synthetic data;
# it does not evaluate the mission-outcome models trained earlier in the notebook.
# Its variables carry a `toy_` prefix so that the notebook's real train/test split
# (`x_train`, `x_test`, `y_train`, `y_test`) is not overwritten when this cell runs.

# Generate a toy dataset with 3 classes
toy_X, toy_y = make_classification(n_samples=1000, n_features=20, n_classes=3, n_informative=3, random_state=42)

# Binarize the output labels (for One-vs-Rest): one indicator column per class
toy_y = label_binarize(toy_y, classes=[0, 1, 2])
n_classes = toy_y.shape[1]

# Split the toy dataset into training and testing sets
toy_X_train, toy_X_test, toy_y_train, toy_y_test = train_test_split(
    toy_X, toy_y, test_size=0.3, random_state=42
)

# Initialize and train the classifier using One-vs-Rest strategy.
# The forest is seeded so the ROC curves and AUC values are the same on every run.
clf = OneVsRestClassifier(RandomForestClassifier(n_estimators=100, random_state=42))
clf.fit(toy_X_train, toy_y_train)

# Get the probability predictions (one column per class)
y_score = clf.predict_proba(toy_X_test)

# Compute ROC curve and ROC AUC for each class, keyed by class index
fpr = dict()
tpr = dict()
roc_auc = dict()

for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(toy_y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plotting the ROC curve for each class
plt.figure()
colors = ['aqua', 'darkorange', 'cornflowerblue']
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2,
             label=f'ROC curve of class {i} (area = {roc_auc[i]:.2f})')

# Diagonal reference line: the ROC of a classifier that guesses at random
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Multiclass')
plt.legend(loc="lower right")
plt.show()
