# Binary Classification - Humans vs Bots

## Path

In [None]:
# Root directory of the project; the measures CSV files read below live under it.
SNA_PROJECT_PATH = "/home/sna_bros/SNA_Project"

## Imports

In [None]:
import pandas as pd
import seaborn as sns
import random as r

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

## Preparation of the training and test set

In [None]:
# Load the measures CSV of the selected subnetwork and preview the first rows.
subnetwork_name = "ukraine"
sub_measures_csv = f"{SNA_PROJECT_PATH}/{subnetwork_name}/{subnetwork_name}_measures.csv"
sub_df = pd.read_csv(sub_measures_csv)
sub_df.head()

In [None]:
# User IDs of the subnetwork; used later to keep these users out of the
# sample drawn from the complete network.
sub_users = sub_df.loc[:, 'UserID']

In [None]:
# Feature matrix: every column except the identifier and the target label,
# standardized with StandardScaler.
# NOTE(review): the scaler is fit on all rows before the train/test split
# below, so test rows contribute to the scaling statistics - confirm this
# is acceptable for the evaluation.
scaler = StandardScaler()
x = scaler.fit_transform(sub_df.drop(columns=['UserID', 'label']))

In [None]:
# Binary target: 1 for rows labelled 'bot', 0 for every other label.
y = pd.Series([int(label == 'bot') for label in sub_df['label']])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

### Naive Bayes Classifier

In [None]:
# Gaussian Naive Bayes fitted on the training split.
nbc = GaussianNB()
nbc.fit(X=x_train, y=y_train)

In [None]:
# Score the Naive Bayes model on the held-out split and print the
# per-class precision / recall / F1 summary.
y_pred = nbc.predict(x_test)
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Confusion matrix of the latest predictions, normalized over all samples
# (cells sum to 1), drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred, normalize='all')
hm = sns.heatmap(cm, annot=True)

### Random Forest Classifier

In [None]:
# Random forest: 100 trees with no depth limit, a fixed seed, and
# 'balanced' class weights.
rf_params = {
    'n_estimators': 100,
    'max_depth': None,
    'random_state': 42,
    'class_weight': 'balanced',
}
clf = RandomForestClassifier(**rf_params)
clf.fit(X=x_train, y=y_train)

In [None]:
# Score the random forest on the held-out split and print the summary.
y_pred = clf.predict(x_test)
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Annotated heatmap of the confusion matrix for the latest predictions,
# normalized over all samples.
cm = confusion_matrix(y_test, y_pred, normalize='all')
hm = sns.heatmap(cm, annot=True)

### Logistic Regression Classifier

In [None]:
# Logistic regression with 'balanced' class weights and up to 1000
# solver iterations.
lrc = LogisticRegression(max_iter=1000, class_weight='balanced')
lrc.fit(X=x_train, y=y_train)

In [None]:
# Score the logistic regression model on the held-out split.
# The predictions must come from `lrc`; predicting with `clf` here would
# just repeat the random forest results under this section.
y_pred = lrc.predict(x_test)
print(classification_report(y_test, y_pred))

In [None]:
# Annotated heatmap of the confusion matrix for the predictions computed in
# the previous cell, normalized over all samples.
cm = confusion_matrix(y_test, y_pred, normalize='all')
hm = sns.heatmap(cm, annot=True)

### Sampling Complete network for further testing

In [None]:
# Load the measures CSV of the complete network and preview the first rows.
complete_measures_csv = f"{SNA_PROJECT_PATH}/complete/complete_measures.csv"
complete_df = pd.read_csv(complete_measures_csv)
complete_df.head()

In [None]:
# Draw up to 5000 users of the complete network that are NOT part of the
# subnetwork the models were trained on.
#
# `value in series` tests the Series *index labels*, not its values, so the
# exclusion is done with `isin`, which compares against the actual user IDs.
not_in_subnetwork = ~complete_df['UserID'].isin(sub_users)
remaining_users = complete_df.loc[not_in_subnetwork, 'UserID'].tolist()

# Clamp the sample size: random.sample raises ValueError when asked for more
# items than the population holds.
# NOTE(review): no seed is set for `random` in the visible cells, so the
# sample changes between runs.
sample_size = min(5000, len(remaining_users))
sample = r.sample(remaining_users, sample_size)

sample_df = complete_df[complete_df['UserID'].isin(sample)]
sample_df.head()

In [None]:
# Feature matrix of the sample: every column except the identifier and the
# target label, standardized with a newly fitted StandardScaler.
# NOTE(review): this refits the scaler on the sample, so these features are
# standardized with different statistics than the ones the models were
# trained on - confirm this per-network scaling is intended.
scaler = StandardScaler()
sample_x = scaler.fit_transform(sample_df.drop(columns=['UserID', 'label']))

In [None]:
# Binary target for the sample: 1 for rows labelled 'bot', 0 otherwise.
sample_y = pd.Series([int(label == 'bot') for label in sample_df['label']])

#### Test Naive Bayes

In [None]:
# Evaluate the Naive Bayes model (`nbc`) on the sampled users of the
# complete network. This cell is the Naive Bayes test, so it must not
# predict with the random forest `clf`.
sample_y_pred = nbc.predict(sample_x)
print(classification_report(sample_y, sample_y_pred))
hm = sns.heatmap(confusion_matrix(sample_y, sample_y_pred, normalize='all'), annot=True)

#### Test Random Forest

In [None]:
# Evaluate the random forest (`clf`) on the sampled users of the complete
# network. This cell is the random forest test, so it must not predict with
# the Naive Bayes model `nbc`.
sample_y_pred = clf.predict(sample_x)
print(classification_report(sample_y, sample_y_pred))
hm = sns.heatmap(confusion_matrix(sample_y, sample_y_pred, normalize='all'), annot=True)

#### Test Logistic Regressor

In [None]:
# Evaluate the logistic regression model on the sampled users: print the
# classification summary, then plot the confusion matrix normalized over
# all samples.
sample_y_pred = lrc.predict(sample_x)
report = classification_report(sample_y, sample_y_pred)
print(report)
cm = confusion_matrix(sample_y, sample_y_pred, normalize='all')
hm = sns.heatmap(cm, annot=True)

## Training using the entire network

### Splitting Complete Network

In [None]:
# Feature matrix of the complete network: all columns except the identifier
# and the target label, standardized with a newly fitted StandardScaler.
# NOTE(review): the scaler is fit on all rows before the train/test split
# below - confirm this is acceptable for the evaluation.
scaler = StandardScaler()
x = scaler.fit_transform(complete_df.drop(columns=['UserID', 'label']))

In [None]:
# Binary target for the complete network: 1 for 'bot', 0 otherwise.
y = pd.Series([int(label == 'bot') for label in complete_df['label']])

In [None]:
# 80/20 train/test split of the complete network with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

### Complete network using Naive Bayes

In [None]:
# Refit a new Gaussian Naive Bayes model on the complete-network training split.
nbc = GaussianNB()
nbc.fit(X=x_train, y=y_train)

In [None]:
# Score the Naive Bayes model fitted on the complete network.
# At this point `clf` is still the random forest trained on the subnetwork,
# so the predictions must come from `nbc`.
y_pred = nbc.predict(x_test)
print(classification_report(y_test, y_pred))

In [None]:
# Annotated heatmap of the confusion matrix for the predictions computed in
# the previous cell, normalized over all samples.
cm = confusion_matrix(y_test, y_pred, normalize='all')
hm = sns.heatmap(cm, annot=True)

### Complete network using Random Forest

In [None]:
# Random forest refitted on the complete-network training split: 100 trees,
# no depth limit, fixed seed, 'balanced' class weights.
rf_params = {
    'n_estimators': 100,
    'max_depth': None,
    'random_state': 42,
    'class_weight': 'balanced',
}
clf = RandomForestClassifier(**rf_params)
clf.fit(X=x_train, y=y_train)

In [None]:
# Score the random forest on the complete-network test split and print the summary.
y_pred = clf.predict(x_test)
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Annotated heatmap of the confusion matrix for the latest predictions,
# normalized over all samples.
cm = confusion_matrix(y_test, y_pred, normalize='all')
hm = sns.heatmap(cm, annot=True)

### Complete network using Logistic Regressor

In [None]:
# Logistic regression refitted on the complete-network training split, with
# 'balanced' class weights and up to 1000 solver iterations.
lrc = LogisticRegression(max_iter=1000, class_weight='balanced')
lrc.fit(X=x_train, y=y_train)

In [None]:
# Score the logistic regression model on the complete-network test split
# and print the summary.
y_pred = lrc.predict(x_test)
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Annotated heatmap of the confusion matrix for the latest predictions,
# normalized over all samples.
cm = confusion_matrix(y_test, y_pred, normalize='all')
hm = sns.heatmap(cm, annot=True)