## Reading Input data
- Setting Header 0 as no header is present in first row.
- setting encoding as UTF-16 as reading with default UTF-8 lead to time column giving unreadable values.
- Seprating Training columns from Target columns.

In [10]:
from sklearn import *
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
import pickle
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

data = pd.read_csv('23OCT_23NOV_10.193.123.18_final.csv.csv', header=0, encoding='UTF-16')
dataC = data
target = data[['PacketDrop_Current_Value','PacketDrop_Severity']]
trainData = data.drop(columns = ['PacketDrop_Current_Value','PacketDrop_Severity'])
trainData.shape

(6559, 69)

## Analyzing numeric data.
- Finding total number of null values
- As we can see there are only 6 columns having null.
- Only 30 values on average in these columns are null.
- Replacing these values will 0 will damage model prediction
- Deleting these rows from the actual data.

In [11]:
numericdata = trainData._get_numeric_data()

# Getting columns having null values
dataWithNull = numericdata.isnull().sum()>0
nullValues = numericdata[dataWithNull.index[dataWithNull]]
mask=False
for col in nullValues.columns: mask = mask | nullValues[col].isnull()
dfnulls = nullValues[mask]
dfnulls

Unnamed: 0,UplinkJitter_Current_Value,UplinkJitter_Min_Current_Value,UplinkJitter_Max_Current_Value,UplinkRSSI_Current_Value,UplinkRSSI_Min_Current_Value,UplinkRSSI_Max_Current_Value
766,,,,,,
981,,,,,,
1101,,,,,,
1283,,,,,,
2072,3.0,3.0,3.0,,,
2264,,,,,,
2432,,,,,,
2525,,,,-62.0,-62.0,-62.0
2548,,,,,,
2982,,,,,,


In [12]:
numericdata=numericdata.dropna(axis=0)
trainData=trainData.dropna(axis=0)
numericdata.shape

(6526, 51)

## Describing categorical data to check number of unique entires in each column.
- Dropping time column as it will have more unique values.
- Summing up all the unique values in categorical variable to check number of unique columns which would be added after using one hot encoding.
- Using get_dummies function of pandas to convert categorical string to integers.

In [13]:
categorical = trainData.drop(numericdata.columns, axis=1)
# Dropping time Field
categorical = categorical.drop(columns=['Time'])
print("Number of unique categorical values",categorical.describe()[1:2].sum(axis=1))
print(pd.get_dummies(trainData[categorical.columns]).shape)

# Processed categorical data.
categorical = pd.get_dummies(trainData[categorical.columns])

Number of unique categorical values unique    27.0
dtype: float64
(6526, 27)


## Merging both categorical and numeric data.

In [14]:
trainData = pd.concat([categorical, numericdata], axis=1)
X = trainData

## Encoding target variable using Label Encoder.

In [15]:
data=data.dropna(axis=0)
y = data['PacketDrop_Severity']
le2 = preprocessing.LabelEncoder()
y = le2.fit_transform(data['PacketDrop_Severity'])
y.shape

(6526,)

## Splitting and Scalling Data
- Splittting data in 80:20 train : test.
- Using min max scaler and normailizing the values.

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Model Building
- Using GradientBoostingClassifier 
- Printing classification report so as to get detailed summary,
- Cross Validation gives us results in multiple train test datasets.

In [21]:
from sklearn.model_selection import GridSearchCV

pipe_long = Pipeline([("scaler", MinMaxScaler()),("GBC",GradientBoostingClassifier()))])
param_grid = {'GBC__learning_rate': [0.01, 0.1, 1, 10, 100], 'GBC__max_depth':[1,2,3]}
grid = GridSearchCV(pipe_long, param_grid, cv=5)
grid.fit(X_train, y_train)
print("Score without poly features: {:.2f}".format(grid.score(X_test, y_test)))
print("Best cross-validation accuracy: {:.2f}".format(grid.best_score_))
print("Test set score: {:.2f}".format(grid.score(X_test, y_test)))
print("Best parameters: {}".format(grid.best_params_))

SyntaxError: invalid syntax (<ipython-input-21-b5dbb7b557d4>, line 3)

In [8]:
clf = GradientBoostingClassifier(max_depth=1, learning_rate=0.1, )
# clf = RandomForestClassifier(max_depth=5, n_estimators=100)
clf.fit(X_train, y_train)


print(accuracy_score(clf.predict(X_test),y_test))
my_op = clf.predict(X_test)
y_test = le2.inverse_transform(y_test)
my_op = le2.inverse_transform(my_op)


print(confusion_matrix(y_test,my_op))
print(classification_report(y_test, my_op))
print(cross_val_score(clf, X, y, cv=5))

1.0
[[   2    0    0    0]
 [   0   27    0    0]
 [   0    0 1275    0]
 [   0    0    0    2]]
             precision    recall  f1-score   support

   critical       1.00      1.00      1.00         2
       down       1.00      1.00      1.00        27
         up       1.00      1.00      1.00      1275

avg / total       1.00      1.00      1.00      1306



  if diff:
  if diff:


[1.        0.9992343 1.        1.        1.       ]


## Saving Model
- Saving it so that we dont need to train again for using it on different data set.

In [118]:
filename = 'finalized_model.sav'
pickle.dump(clf, open(filename, 'wb'))

## Running Test on server logs of different time frame.

In [122]:
data = pd.read_csv('27_Sep_23_Oct_10.193.123.18_final.csv', header=0,  encoding='UTF-16').dropna(axis=0)
trainData = data.drop(columns = ['PacketDrop_Current_Value','PacketDrop_Severity'])
numericdata = trainData._get_numeric_data()
categorical = trainData.drop(numericdata.columns, axis=1)
# Dropping time Field
categorical = categorical.drop(columns=['Time'])
print("Number of unique categorical values",categorical.describe()[1:2].sum(axis=1))
print(pd.get_dummies(trainData[categorical.columns]).shape)

# Processed categorical data.
categorical = pd.get_dummies(trainData[categorical.columns])
trainData = pd.concat([categorical, numericdata], axis=1)
X_test = trainData
X_test = scaler.transform(X_test)
y = data['PacketDrop_Severity']
le2 = preprocessing.LabelEncoder()
y_test = le2.fit_transform(data['PacketDrop_Severity'])

Number of unique categorical values unique    27.0
dtype: float64
(3659, 27)


## Classification report on new Dataset
- Since we see similar performance on new data set we are quite sure that we didnt overfit the model.

In [125]:
my_op = clf.predict(X_test)

print(confusion_matrix(y_test,my_op))
print(classification_report(y_test, my_op))

[[   3    0    0    0]
 [   0   16    0    0]
 [   0    0 3637    0]
 [   0    0    0    3]]
             precision    recall  f1-score   support

          0       1.00      1.00      1.00         3
          1       1.00      1.00      1.00        16
          2       1.00      1.00      1.00      3637
          3       1.00      1.00      1.00         3

avg / total       1.00      1.00      1.00      3659

