# Data 245 - Machine Learning Project 

# Internet Downtime Prediction Analysis using ML Techniques

### Presented By: Group 6 (Bhavik Patel, Poojan Gagrani, Kashish Thakur, Yuti Khamker)

## 1. Importing libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib as mp
from matplotlib import pyplot as plt
import seaborn as sns
import plotly.express as px
from io import StringIO
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler
from sklearn.utils import resample, shuffle
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import joblib
import warnings
warnings.filterwarnings('ignore')

## 2. Reading file

In [None]:
# Load the raw outage dataset from a local absolute path.
# NOTE(review): pandas' default NA handling parses the literal string 'NA' as missing,
# which presumably is why Namibia's country code shows up as null later in this notebook
# (see the Windhoek handling) -- confirm; `keep_default_na=False` would avoid it.
df = pd.read_csv('/Users/bhavikpatel/Desktop/poject MSDA/Data 245/Project/Data/Outage_Data.csv')

In [None]:
# Down-sample to at most 1,000,000 rows with a fixed seed for reproducibility.
# DataFrame.sample raises ValueError when n exceeds the row count (sampling without
# replacement), so cap the request at the number of rows actually loaded.
sample_size = min(1_000_000, len(df))
df = df.sample(n=sample_size, random_state=42).reset_index(drop=True)

## 3. Data understanding

### Original feature description from data source, Ref: https://wiki.mozilla.org/Mozilla_Network_Outages_Data_Project

`country`: the Country code of the client.

`city`: the City name (only for cities with a population >= 15000, 'unknown' otherwise).

`datetime`: the date and the time (truncated to hour) the data was submitted by the client.

`proportion_undefined`: the proportion of users who failed to send telemetry for a reason that was not listed in the other cases.

`proportion_timeout`: the proportion of users that had their connection timeout while uploading telemetry (after 90s, in Firefox Desktop).

`proportion_abort`: the proportion of users that had their connection terminated by the client (for example, terminating open connections before shutting down).

`proportion_unreachable`: the proportion of users that failed to upload telemetry because the server was not reachable (e.g. because the host was not reachable, proxy problems or OS waking up after a suspension).

`proportion_terminated`: the proportion of users that had their connection terminated internally by the networking code.

`proportion_channel_open`: the proportion of users for which the upload request was terminated immediately, by the client, because of a Necko internal error.

`avg_dns_success_time`: the average time it takes for a successful DNS resolution, in milliseconds.

`missing_dns_success`: counts how many sessions did not report the `DNS_LOOKUP_TIME` histogram.

`avg_dns_failure_time`: the average time it takes for an unsuccessful DNS resolution, in milliseconds.

`missing_dns_failure`: counts how many sessions did not report the `DNS_FAILED_LOOKUP_TIME` histogram.

`count_dns_failure`: the average count of unsuccessful DNS resolutions reported.

`ssl_error_prop`: the proportion of users that reported an error through the `SSL_CERT_VERIFICATION_ERRORS` histogram.

`avg_tls_handshake_time`: the average time after the TCP SYN to ready for HTTP, in milliseconds.

### Defining dataset

In [None]:
# Preview the first five rows of the sampled dataframe.
df.head()

**Description**

Showing first 5 values of the dataframe.

In [None]:
# Preview the last five rows of the sampled dataframe.
df.tail()

**Description**

Showing last 5 values of the dataframe.

In [None]:
# Print column dtypes and non-null counts.
df.info()

**Description**

Checking datatypes of the attributes.

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

**Description**

Showing descriptive statistics of the dataframe.

In [None]:
# Number of distinct values per column.
df.nunique()

**Description**

Showing the number of unique values in each column of the dataframe.

In [None]:
# Number of missing values per column.
df.isnull().sum()

**Description**

Checking the null values in the dataframe.

## 4. Data Quality Report

### Data quality for continuous features

In [None]:
# Continuous (numeric) columns summarised in the data quality report.
continuous_features = [
    'proportion_undefined', 'proportion_timeout', 'proportion_abort',
    'proportion_unreachable', 'proportion_terminated', 'proportion_channel_open',
    'avg_dns_success_time', 'avg_dns_failure_time', 'count_dns_failure',
    'ssl_error_prop', 'avg_tls_handshake_time'
]

# Slice the numeric subset once instead of re-slicing df for every statistic.
continuous_df = df[continuous_features]

# One entry per report column; the dict order fixes the column order of the table.
data_quality_report = pd.DataFrame(
    {
        'Count': continuous_df.count(),
        'Missing Values in %': (1 - (continuous_df.count() / len(df))) * 100,
        'Cardinality': continuous_df.nunique(),
        'Minimum': continuous_df.min(),
        'Quartile 1': continuous_df.quantile(0.25),
        'Mean': continuous_df.mean(),
        'Median': continuous_df.median(),
        'Quartile 3': continuous_df.quantile(0.75),
        'Maximum': continuous_df.max(),
        'Standard Deviation': continuous_df.std(),
    },
    index=continuous_features,
)

data_quality_report

### Data quality for categorical features

In [None]:
# Data quality report for the categorical columns: counts, missing share,
# cardinality, and the three most frequent values with their frequencies.
categorical_features = ['country', 'city']

data_quality_report_categorical = pd.DataFrame(index=categorical_features)

data_quality_report_categorical['Count'] = df[categorical_features].count()

data_quality_report_categorical['Missing Values in %'] = (1 - (df[categorical_features].count() / len(df))) * 100

data_quality_report_categorical['Cardinality'] = df[categorical_features].nunique()

data_quality_report_categorical['Mode'] = df[categorical_features].mode().iloc[0]

# value_counts() is sorted by descending frequency, so position k holds the
# (k+1)-th most frequent value. Compute it once per column and reuse it.
category_counts = {col: df[col].value_counts() for col in categorical_features}

for rank, prefix in enumerate(['', '2nd ', '3rd ']):
    labels, frequencies, percentages = [], [], []
    for col in categorical_features:
        counts = category_counts[col]
        # Guard on the rank actually being read; the original checked `> 1` even
        # for the 3rd mode and divided the 'N/A' placeholder by len(df).
        if len(counts) > rank:
            frequency = counts.iloc[rank]
            labels.append(counts.index[rank])
            frequencies.append(frequency)
            percentages.append((frequency / len(df)) * 100)
        else:
            labels.append('N/A')
            frequencies.append('N/A')
            percentages.append('N/A')

    # The first-rank label column ('Mode') is already filled from DataFrame.mode().
    if rank > 0:
        data_quality_report_categorical[f'{prefix}Mode'] = labels
    data_quality_report_categorical[f'{prefix}Mode Frequency'] = frequencies
    data_quality_report_categorical[f'{prefix}Mode in %'] = percentages

data_quality_report_categorical

## 5. Initial Exploratory Data Analysis 

In [None]:
sns.set_style("whitegrid")

# Continuous columns shown in the 3x3 histogram grid.
selected_columns = [
    'proportion_timeout', 'proportion_abort', 'proportion_unreachable',
    'proportion_terminated', 'avg_dns_success_time', 'avg_dns_failure_time',
    'count_dns_failure', 'ssl_error_prop', 'avg_tls_handshake_time'
]

# Columns whose name contains one of these tags get their x-axis clamped to [0, max].
clamped_tags = ('proportion', 'avg', 'count')

fig, axes = plt.subplots(3, 3, figsize=(15, 15))
for ax, col in zip(axes.flat, selected_columns):
    sns.histplot(df[col], bins=50, kde=True, ax=ax)
    ax.set_title(f"Distribution of {col}")
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    if any(tag in col for tag in clamped_tags):
        ax.set_xlim(0, df[col].max())

plt.tight_layout()
plt.show()

**Description**

The histograms above show the distribution of the continuous features in the dataset. Most of the proportion values lie between 0 and 1, with some values above 1 that are possibly outliers. However, avg_tls_handshake_time, avg_dns_success_time and avg_dns_failure_time take much larger values because they are recorded in milliseconds; they could be transformed, if required.

In [None]:
# Every column whose name contains "proportion" gets its own boxplot panel.
proportion_features = [name for name in df.columns if "proportion" in name]

plt.figure(figsize=(15, 10))
sns.set_style("whitegrid")

for position, feature in enumerate(proportion_features, start=1):
    ax = plt.subplot(3, 3, position)
    sns.boxplot(y=df[feature], ax=ax)
    # Fixed y-range slightly wider than [0, 1] so all panels share the same scale.
    ax.set_ylim(-0.1, 1.1)
    ax.set_title(feature)
    ax.set_ylabel("")

plt.tight_layout()
plt.show()

**Description**

The boxplots above visualize the spread of the proportion features (undefined, timeout, abort, unreachable, terminated and channel_open). They are necessary to understand our target feature, as they capture the outcomes of the telemetry signals sent from the host machines. Here, we can observe that proportion_unreachable has the highest spread of values and also aligns with our target feature requirement, as it captures failed telemetry uploads, which indicate a possible outage.

In [None]:
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(12, 10))

# (y column, panel title) for each panel in row-major order; the x axis is
# always proportion_unreachable.
scatter_panels = [
    ('proportion_timeout', 'Proportion Unreachable vs Timeout'),
    ('proportion_terminated', 'Proportion Unreachable vs Terminated'),
    ('proportion_abort', 'Proportion Unreachable vs Proportion Abort'),
    ('proportion_channel_open', 'Proportion Unreachable vs Proportion Channel Open'),
    ('count_dns_failure', 'Proportion Unreachable vs Count DNS Failure'),
    ('avg_tls_handshake_time', 'Proportion Unreachable vs Avg TLS Handshake Time'),
]

for ax, (y_column, panel_title) in zip(axes.flat, scatter_panels):
    sns.scatterplot(data=df, x='proportion_unreachable', y=y_column, ax=ax)
    ax.set_title(panel_title)

plt.tight_layout()
plt.show()

**Description**

The scatterplots above show how the continuous features vary with proportion_unreachable, which is our target feature. This helps us understand the correlation between the features and the density of their values.

In [None]:
# Side-by-side bar charts of the 15 most frequent countries and cities.
fig, bar_axes = plt.subplots(1, 2, figsize=(15, 6))

# (column, bar colour, title, x-axis label) for each panel, left to right.
bar_panels = [
    ('country', 'skyblue', 'Top 15 Countries by Data Count', 'Country'),
    ('city', 'lightcoral', 'Top 15 Cities by Data Count', 'City'),
]

for ax, (column, bar_color, panel_title, x_label) in zip(bar_axes, bar_panels):
    df[column].value_counts().head(15).plot(kind='bar', color=bar_color, ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Count')

plt.tight_layout()
plt.show()

**Description**

The bar plots above show the top 15 countries and cities with the highest data counts. Most of the data is captured from the United States, followed by Germany, France and China. It's important to note that 'unknown' has the highest count among cities, because cities with a population of less than 15,000 are labelled as unknown, as originally mentioned by the data owners.

In [None]:
# Numeric columns included in the correlation heatmap.
selected_numeric_features = [
    'proportion_timeout', 'proportion_abort', 'proportion_unreachable',
    'proportion_terminated', 'avg_dns_success_time', 'avg_dns_failure_time',
    'count_dns_failure', 'ssl_error_prop', 'avg_tls_handshake_time'
]

# Pairwise correlation (DataFrame.corr default method) between the selected columns.
correlation_matrix = df.loc[:, selected_numeric_features].corr()

fig, heatmap_ax = plt.subplots(figsize=(10, 7))
# Pin the colour scale to the full [-1, 1] correlation range.
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=heatmap_ax)
heatmap_ax.set_title('Correlation Matrix')
plt.show()

**Description**

The heatmap above is used to understand the correlation between continuous features in the dataset. We can see that most of the features have positive correlation. However there are some features which have little negative correlation between them.

## 6. Data Cleaning & Transformation

### Handling country with null values

In [None]:
# Rows whose country is missing. Take an explicit copy: this frame is written to
# later in the notebook, and assigning into a boolean-mask slice of df is a
# chained assignment whose effect pandas does not guarantee.
df_null_country = df[df['country'].isnull()].copy()

In [None]:
# Distinct city values among the rows with a missing country.
df_null_country['city'].unique()

In [None]:
# Count how many null-country rows belong to Windhoek vs. an unknown city.
# Series.get with a default avoids a KeyError when a label is absent from the sample.
null_country_city_counts = df_null_country['city'].value_counts()
city_windhoek_count = null_country_city_counts.get('Windhoek', 0)
city_unknown_count = null_country_city_counts.get('unknown', 0)

total = (city_windhoek_count + city_unknown_count)

print('Windhoek count = {} and unknown count = {}'.format(city_windhoek_count, city_unknown_count))

In [None]:
# Set country to 'NA' for every Windhoek row, in both the full frame and the
# null-country subset.
for frame in (df, df_null_country):
    windhoek_rows = frame["city"] == "Windhoek"
    frame.loc[windhoek_rows, "country"] = 'NA'

**Description**

Assigning the `country` value 'NA' i.e., Namibia where `city` is Windhoek.

In [None]:
# Re-check missing values per column after filling the Windhoek country codes.
df.isnull().sum()

In [None]:
# Rows of the null-country subset whose country is still missing.
df_null_country[df_null_country['country'].isnull()]

In [None]:
# Drop, in place, every row whose country is still missing.
df.dropna(subset=['country'], inplace=True)

**Description**

Dropping all the remaining rows with a null country, as their city is also unknown and their count is very small.

In [None]:
# Missing values per column after dropping the null-country rows.
df.isnull().sum()

### Handling all the null values

In [None]:
# Drop every row that has a missing value in any column.
df = df.dropna()

**Description**

Dropping all the null values as the data count for null values is quite small.


In [None]:
# Confirm that no missing values remain.
df.isnull().sum()

### Handling city with unknown values

In [None]:
# Number of rows whose city is the placeholder 'unknown'. Summing the boolean
# mask yields 0 when the label is absent, instead of the KeyError raised by
# indexing value_counts() with a missing label.
city_unknown_count = (df['city'] == 'unknown').sum()

city_unknown_count

In [None]:
# Subset of rows whose city is the placeholder 'unknown', displayed for inspection.
df_city_unknown = df[df['city'] == 'unknown']
df_city_unknown

## 7. Feature Engineering

### Define time slots

In [None]:
# Convert 'datetime' to 'hour'
# Parse the column to pandas datetimes, then keep the hour of day as an integer feature.
df['datetime'] = pd.to_datetime(df['datetime'])
df['hour'] = df['datetime'].dt.hour

# Define time slots
def get_detailed_time_slot(hour):
    """Map an hour of day to a named time slot.

    Each slot covers the half-open interval [start, end). Any hour that falls
    in none of the listed intervals (21 and later, or outside 0-20) is 'Night'.
    """
    slots = (
        (0, 6, 'Late Night'),
        (6, 9, 'Early Morning'),
        (9, 12, 'Late Morning'),
        (12, 15, 'Early Afternoon'),
        (15, 18, 'Late Afternoon'),
        (18, 21, 'Early Evening'),
    )
    for start, end, slot_name in slots:
        if start <= hour < end:
            return slot_name
    return 'Night'

# Label every row with the named time slot of its hour.
df['time_slot'] = df['hour'].apply(get_detailed_time_slot) 

### Class Labeling (Data discretization) using composite score

In [None]:
# Failure-related features that feed the internet quality score.
features = ['proportion_timeout', 'proportion_unreachable', 'proportion_terminated', 
            'avg_dns_failure_time', 'count_dns_failure']

# Composite score = row-wise sum of the z-scored (column-standardised) features.
score_inputs = df[features]
standardized_inputs = (score_inputs - score_inputs.mean()) / score_inputs.std()
df['composite_score'] = standardized_inputs.sum(axis=1)

# Quartile cut-offs of the composite score, indexed by 0.25 / 0.5 / 0.75.
quantiles = df['composite_score'].quantile([0.25, 0.5, 0.75])

# Define the labeling function with the correct quartile values
def label_quality(score, quantiles):
    """Return the quality label for a composite score.

    `quantiles` is indexed by 0.25, 0.50 and 0.75. The score gets the label of
    the first cut-off it does not exceed: 'good', 'moderate', then 'bad'.
    Anything above the 0.75 cut-off is 'worse'.
    """
    for cutoff, label in ((0.25, 'good'), (0.50, 'moderate'), (0.75, 'bad')):
        if score <= quantiles[cutoff]:
            return label
    return 'worse'

# Label every row from its composite score using the quartile cut-offs.
df['quality_label'] = df['composite_score'].map(
    lambda score: label_quality(score, quantiles)
)

# Integer code per label, ordered from 'good' (0) to 'worse' (3).
label_map = {'good': 0, 'moderate': 1, 'bad': 2, 'worse': 3}
df['quality_label_encoded'] = df['quality_label'].map(label_map)


In [None]:
# Show every column when displaying dataframes (no column truncation).
pd.set_option('display.max_columns', None)

In [None]:
# Preview the dataframe including the newly engineered columns.
df.head()

### Checking feature importance using RFE

In [None]:
# Candidate features for RFE: drop the target columns, the non-numeric columns,
# and 'composite_score'. The label is a quartile binning of composite_score, so
# leaving it in would leak the target and make the ranking uninformative.
X = df.drop(['quality_label', 'quality_label_encoded', 'composite_score',
             'datetime', 'time_slot', 'country', 'city'], axis=1)
y = df['quality_label_encoded']

# Recursive feature elimination with a random forest; seeded so the ranking is
# reproducible between runs.
rfe = RFE(estimator=RandomForestClassifier(random_state=42), n_features_to_select=10)
rfe.fit(X, y)

# RFE ranking (1 = selected; larger numbers were eliminated earlier)
ranking_rfe = rfe.ranking_

# Map the rankings back to column names and sort from most to least important.
rfe_dict = dict(zip(X.columns, ranking_rfe))
sorted_rfe = sorted(rfe_dict.items(), key=lambda item: item[1])

In [None]:
# Print the (feature, rank) pairs sorted by RFE rank.
print(sorted_rfe)

### Encoding the categorical columns

In [None]:
# Label encode 'country' and 'city' as integer codes.
label_encoder_country = LabelEncoder()
label_encoder_city = LabelEncoder()

# Chronological order of the slots produced by get_detailed_time_slot. Without
# explicit categories OrdinalEncoder sorts the labels alphabetically, so the
# codes would not follow the time of day.
time_slot_order = ['Late Night', 'Early Morning', 'Late Morning', 'Early Afternoon',
                   'Late Afternoon', 'Early Evening', 'Night']
ordinal_encoder_time_slot = OrdinalEncoder(categories=[time_slot_order])

df['country_encoded'] = label_encoder_country.fit_transform(df['country'])
df['city_encoded'] = label_encoder_city.fit_transform(df['city'])

# fit_transform returns an (n, 1) array; flatten it to a single column of codes.
df['time_slot_encoded'] = ordinal_encoder_time_slot.fit_transform(df[['time_slot']]).ravel()

# Drop the original columns that have been encoded
df.drop(['country', 'city', 'time_slot', 'quality_label'], axis=1, inplace=True)

## 8. Splitting the dataset into train, validation and test sets

In [None]:
# Features are separated: numerical features that need scaling and categorical encoded features that don't
numerical_features = ['proportion_timeout', 'proportion_unreachable', 'proportion_terminated', 
                      'avg_dns_success_time', 'avg_dns_failure_time', 'count_dns_failure', 'ssl_error_prop']
categorical_features = ['country_encoded', 'city_encoded', 'time_slot_encoded']
X_numerical = df[numerical_features]
X_categorical = df[categorical_features]

# Target variable
y = df['quality_label_encoded']

# Split numeric features, categorical features and target in ONE call so that all
# three receive the same row partition. Splitting them separately (one call
# stratified, the other not) selects different rows even with the same seed,
# which misaligns the rows once the parts are concatenated positionally.

# Train+validate vs. test (90-10 split)
(X_temp_num, X_test_num,
 X_temp_cat, X_test_cat,
 y_temp, y_test) = train_test_split(
    X_numerical, X_categorical, y,
    test_size=0.1, stratify=y, random_state=42, shuffle=True)

# Train vs. validate (8/9 - 1/9 of the remainder, i.e. roughly 80-10 of the original)
(X_train_num, X_validate_num,
 X_train_cat, X_validate_cat,
 y_train, y_validate) = train_test_split(
    X_temp_num, X_temp_cat, y_temp,
    test_size=1/9, stratify=y_temp, random_state=42, shuffle=True)

In [None]:
# (rows, columns) of the numeric training features.
X_train_num.shape

### Initialize the StandardScaler for numerical features

In [None]:
# Standardise the numeric features (zero mean, unit variance per column).
scaler = StandardScaler()

# Fit the scaler on the numerical part of the training data and transform
X_train_num_scaled = scaler.fit_transform(X_train_num)
# Validation and test sets reuse the statistics learned from the training set only.
X_validate_num_scaled = scaler.transform(X_validate_num)
X_test_num_scaled = scaler.transform(X_test_num)

### Combining scaled continuous and categorical features

In [None]:
# Column-wise join: scaled numeric columns first, then the encoded categorical columns.
X_train = np.hstack((X_train_num_scaled, X_train_cat.values))
X_validate = np.hstack((X_validate_num_scaled, X_validate_cat.values))
X_test = np.hstack((X_test_num_scaled, X_test_cat.values))

In [None]:
# Sanity check: the combined width should equal numeric width + categorical width.
print(f'Continous features scaled shape {X_train_num_scaled.shape}')
print(f'Categorical features shape {X_train_cat.shape}')
print(f'X train shape {X_train.shape}')

### Checking the count of each class in the target feature

In [None]:
# Check class distribution in the target feature
# The labels are quartile bins of composite_score and the split is stratified,
# so the four class counts are expected to be close to equal.
class_counts = y_train.value_counts()
print(class_counts)

# Decide to use SMOTE based on class distribution
# Generally, if any class is less than 10-20% of the majority class, SMOTE might be useful

### Applying SMOTE to the training set if the classes are imbalanced

In [None]:
# Initialize SMOTE and resample the training data
# NOTE(review): the last three columns of X_train are integer category codes
# (country, city, time slot). SMOTE synthesises samples by interpolating between
# neighbours, which can produce fractional codes -- consider SMOTENC; confirm.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [None]:
import pandas as pd

# Convert to pandas Series for easy value counts (if not already a Series)
y_train_smote_series = pd.Series(y_train_smote)
# Class counts after resampling, to verify that the classes are balanced.
label_distribution = y_train_smote_series.value_counts()
print(label_distribution)


## 9. Modeling using Support Vector Machine

In [None]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming X_train_smote, y_train_smote, X_validate, y_validate are already defined and scaled



In [None]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Define C values (regularisation strengths) to evaluate
C_values = [0.1]
train_accuracies = []
validation_accuracies = []

# Fit one LinearSVC per C value and record training/validation accuracy.
for C_value in C_values:
    svm_model = LinearSVC(C=C_value, random_state=42)
    svm_model.fit(X_train_smote, y_train_smote)

    # Training accuracy
    y_train_pred = svm_model.predict(X_train_smote)
    train_accuracy = accuracy_score(y_train_smote, y_train_pred)
    train_accuracies.append(train_accuracy)

    # Validation accuracy
    y_val_pred = svm_model.predict(X_validate)
    val_accuracy = accuracy_score(y_validate, y_val_pred)
    validation_accuracies.append(val_accuracy)

    # Report inside the loop so every C value is printed, not only the last one.
    print(f"Accuracy for C={C_value}: Training = {train_accuracy * 100:.2f}%, Validation = {val_accuracy * 100:.2f}%")


## 10. Model Evaluation

###  Plotting accuracies for SVM

In [None]:
# Training vs. validation accuracy for each C value, on a log-scaled x-axis.
fig, accuracy_ax = plt.subplots(figsize=(10, 6))

accuracy_curves = (
    (train_accuracies, 'Training Accuracy'),
    (validation_accuracies, 'Validation Accuracy'),
)
for accuracies, curve_label in accuracy_curves:
    accuracy_ax.plot(C_values, accuracies, label=curve_label, marker='o')

accuracy_ax.set_title('SVM Training and Validation Accuracy vs. C Parameter')
accuracy_ax.set_xlabel('C Parameter')
accuracy_ax.set_ylabel('Accuracy')
accuracy_ax.legend()
accuracy_ax.set_xscale('log')
accuracy_ax.grid(True)
plt.show()

### Confusion Matrix

In [None]:
# Refit a LinearSVC for each C value and draw its validation confusion matrix.
for C_value in C_values:
    svm_model = LinearSVC(C=C_value, random_state=42)
    svm_model.fit(X_train_smote, y_train_smote)
    y_val_pred = svm_model.predict(X_validate)
    cm = confusion_matrix(y_validate, y_val_pred)

    fig, cm_ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=cm_ax)
    cm_ax.set_title(f'Confusion Matrix (SVM) - C={C_value}')
    cm_ax.set_ylabel('Actual Label')
    cm_ax.set_xlabel('Predicted Label')
    plt.show()


In [None]:
# Compute and plot the confusion matrix
# y_pred_validate was referenced without ever being assigned (NameError);
# derive it from the most recently fitted svm_model.
y_pred_validate = svm_model.predict(X_validate)
cm = confusion_matrix(y_validate, y_pred_validate)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title('Confusion Matrix (SVM)')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')
plt.show()


### Classification Report:

In [None]:

# Generate a classification report
# Compute the validation predictions here so this cell does not depend on a
# name that may not have been assigned by an earlier cell.
y_pred_validate = svm_model.predict(X_validate)
report = classification_report(y_validate, y_pred_validate)
print(report)


### Saving model

In [None]:
# Save the SVM model to a file
# svm_model is whichever model was fitted last; the file is written to the
# current working directory.
joblib.dump(svm_model, 'svm_model.joblib')


### Loading the Model:

In [None]:
# Load the SVM model from the file
# joblib files are pickle-based: only load files produced by a trusted source.
loaded_svm_model = joblib.load('svm_model.joblib')

### Testing model

In [None]:
# Make predictions on the test set
# The test set is first used here; it takes no part in fitting the scaler or the model.
y_pred_test = loaded_svm_model.predict(X_test)

# Evaluate the predictions
# Accuracy is printed as a fraction in [0, 1], not as a percentage.
test_accuracy = accuracy_score(y_test, y_pred_test)
print(f"Test Accuracy: {test_accuracy}")
