## **Importing the Required Libraries**

In [1]:
import scipy.io
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

## **Importing the DataSet.mat**

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
data = scipy.io.loadmat('/content/drive/MyDrive/mlProject/influenza_outbreak_dataset (1).mat')
data.keys()

dict_keys(['__header__', '__version__', '__globals__', 'flu_X_tr', 'flu_Y_tr', 'flu_X_te', 'flu_Y_te', 'flu_locs', 'flu_keywords'])

## **Extracting the data to work on**

In [4]:
x_train = []
y_train = []
x_test = []
y_test = []

for i in range(48):
    x_train.append(data['flu_X_tr'][0][i].todense())
    x_test.append(data['flu_X_te'][0][i].todense())
    
    y_train.append(data['flu_Y_tr'][0][i])
    y_test.append(data['flu_Y_te'][0][i])


In [5]:
x_train=np.array(x_train)
x_train = x_train.reshape(x_train.shape[0]*x_train.shape[1],x_train.shape[2])
x_train = pd.DataFrame(x_train)

x_test = np.array(x_test)
x_test = x_test.reshape(x_test.shape[0]*x_test.shape[1],x_test.shape[2])
x_test = pd.DataFrame(x_test)

print("x_train: ",x_train.shape)
print('x_test: ',x_test.shape)

#-----------------------------------------------------------------------------------------------------------------------------

y_train = np.array(y_train)
y_train = y_train.reshape(-1,1)
y_train = pd.DataFrame(y_train)

y_test = np.array(y_test)
y_test = y_test.reshape(-1,1)
y_test = pd.DataFrame(y_test)


print("y_train: ",y_train.shape)
print('y_test:',y_test.shape)

x_train:  (52560, 545)
x_test:  (23280, 545)
y_train:  (52560, 1)
y_test: (23280, 1)


In [6]:
x_train.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,535,536,537,538,539,540,541,542,543,544
count,52560.0,52560.0,52560.0,52560.0,52560.0,52560.0,52560.0,52560.0,52560.0,52560.0,...,52560.0,52560.0,52560.0,52560.0,52560.0,52560.0,52560.0,52560.0,52560.0,52560.0
mean,1.076636,1.780137,2.172945,2.465373,2.70489,2.917218,3.091781,3.254357,3.409817,3.561986,...,0.011929,0.011358,0.044463,0.013527,0.412005,0.039593,0.005156,0.018893,0.105327,0.010312
std,2.72574,3.661912,4.317834,4.826208,5.233899,5.609713,5.895678,6.182278,6.456202,6.730221,...,0.111849,0.122462,0.231432,0.183168,1.024046,0.253539,0.07895,0.15497,0.408786,0.112771
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,2.0,3.0,3.0,3.0,3.0,4.0,4.0,4.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,152.0,202.0,221.0,230.0,238.0,238.0,254.0,267.0,272.0,277.0,...,2.0,11.0,7.0,26.0,32.0,13.0,4.0,5.0,10.0,3.0


## **scaling the data to control the standard deviation**

In [7]:
stdScaler = StandardScaler()
x_train = stdScaler.fit_transform(x_train)
x_test = stdScaler.transform(x_test)

In [8]:
x_train.std()

0.9999999999999921

## **removing the unwanted Features**

In [9]:
pca = PCA(n_components=.85)
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.transform(x_test)

In [10]:
np.linalg.det(np.cov(x_train_pca,rowvar=0))

0.0036946545572521207

### **using SMOT to balancing the data (i.e., number of  true = false )**

In [11]:
sm = SMOTE(random_state = 2)
x_train_new, y_train_new = sm.fit_resample(x_train_pca, y_train)

print('After OverSampling, the shape of train_X: {}'.format(x_train_new.shape))
print('After OverSampling, the shape of train_y: {}'.format(y_train_new.shape))

After OverSampling, the shape of train_X: (99310, 366)
After OverSampling, the shape of train_y: (99310, 1)


## **Implementing the gausian  Naive Bayes**

In [12]:
from sklearn.naive_bayes import GaussianNB

GNB = GaussianNB()
GNB.fit(x_train_new,y_train_new.values.ravel())
y_pred=GNB.predict(x_test_pca)
print(classification_report(y_pred,y_test))
GNB_predict_prob = GNB.predict_proba(x_test_pca)
GNB_predict_prob.shape

              precision    recall  f1-score   support

           0       0.85      0.93      0.89     19186
           1       0.38      0.21      0.28      4094

    accuracy                           0.80     23280
   macro avg       0.62      0.57      0.58     23280
weighted avg       0.77      0.80      0.78     23280

[[17782  1404]
 [ 3216   878]]


(23280, 2)

## **Implementing the Logistic Regression**

In [19]:
lr=LogisticRegression()
lr.fit(x_train_new,y_train_new.values.ravel())
y_pred=lr.predict(x_test_pca)
print(classification_report(y_pred,y_test))
lr_predict_prob = lr.predict_proba(x_test_pca)
lr_predict_prob.shape

              precision    recall  f1-score   support

           0       0.76      0.95      0.84     16851
           1       0.62      0.22      0.32      6429

    accuracy                           0.75     23280
   macro avg       0.69      0.58      0.58     23280
weighted avg       0.72      0.75      0.70     23280

[[15978   873]
 [ 5020  1409]]


(23280, 2)

## **Implementing the KNeighborsClassifier**

In [14]:
knn = KNeighborsClassifier()
knn.fit(x_train_new,y_train_new.values.ravel())
y_pred=knn.predict(x_test_pca)
print(classification_report(y_pred,y_test))
knn_predict_prob = knn.predict_proba(x_test_pca)
knn_predict_prob.shape

              precision    recall  f1-score   support

           0       0.87      0.92      0.90     19889
           1       0.32      0.22      0.26      3391

    accuracy                           0.82     23280
   macro avg       0.60      0.57      0.58     23280
weighted avg       0.79      0.82      0.80     23280



(23280, 2)

## **Implementing SVM**

In [15]:
# commented because it is taking too much time in my system to Run 
# svm = SVC()
# svm.fit(x_train_new,y_train_new.values.ravel())
# y_pred=svm.predict(x_test_pca)
# print(classification_report(y_pred,y_test))
# svm_predict_prob = svm.predict_proba(x_test_pca)
# svm_predict_prob.shape

## **Find the average probability of the (KNN_probability_matrics+Logistic_reg._probability_matrics+GausianNB_probability_matrics)**

In [16]:
avgerage_probablity=np.divide(np.add(knn_predict_prob,lr_predict_prob,GNB_predict_prob),3)
y_pred = np.argmax(avgerage_probablity,axis=1)
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.88      0.93      0.90     19797
           1       0.39      0.25      0.31      3483

    accuracy                           0.83     23280
   macro avg       0.63      0.59      0.60     23280
weighted avg       0.80      0.83      0.81     23280



## **Implementing Decision Tree Classifiers**

In [17]:
from sklearn.tree import DecisionTreeClassifier

dTree = DecisionTreeClassifier()
dTree.fit(x_train_new,y_train_new.values.ravel())
y_pred=dTree.predict(x_test_pca)
print(classification_report(y_pred,y_test))
dTree_predict_prob = dTree.predict_proba(x_test_pca)
dTree_predict_prob.shape

              precision    recall  f1-score   support

           0       0.80      0.92      0.86     18331
           1       0.35      0.16      0.22      4949

    accuracy                           0.76     23280
   macro avg       0.58      0.54      0.54     23280
weighted avg       0.71      0.76      0.72     23280



(23280, 2)

## **Implementing Random Forest Classifiers**

In [18]:
from sklearn.ensemble import RandomForestClassifier

R_Forest = RandomForestClassifier()
R_Forest.fit(x_train_new,y_train_new.values.ravel())
y_pred=R_Forest.predict(x_test_pca)
print(classification_report(y_pred,y_test))
R_Forest_predict_prob = R_Forest.predict_proba(x_test_pca)
R_Forest_predict_prob.shape

              precision    recall  f1-score   support

           0       0.96      0.91      0.94     22170
           1       0.16      0.33      0.21      1110

    accuracy                           0.89     23280
   macro avg       0.56      0.62      0.58     23280
weighted avg       0.93      0.89      0.90     23280



(23280, 2)