<a href="https://colab.research.google.com/github/awesome786engineer/Machine-Learning-Projects/blob/main/InstaFakeID_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Authenticate this session with the Hugging Face Hub.
# NOTE(review): login() is called without arguments, so it presumably prompts
# for an access token interactively — confirm for non-interactive runs.
from huggingface_hub import login
login()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


In [None]:
from datasets import load_dataset
# Load the "nahiar/instagram_bot_detection" dataset through the `datasets` library.
dataset = load_dataset("nahiar/instagram_bot_detection")

In [None]:
# Convert the dataset's 'train' split to a pandas DataFrame and preview its first rows.
df = dataset['train'].to_pandas()
df.head()

In [None]:
# Dimensions of the full DataFrame as (rows, columns).
df.shape

In [None]:
# Shuffle every row once with a fixed seed so that the positional split below
# is random but reproducible.
df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)

# The first 80% of the shuffled rows become the training set, the rest the test set.
splitting_point = int(0.8 * len(df))
train_df, test_df = (
    df_shuffled.iloc[:splitting_point],
    df_shuffled.iloc[splitting_point:],
)

In [None]:
# Shuffle each split again (same fixed seed) and renumber its index from 0.
train_df, test_df = (
    split.sample(frac=1, random_state=42).reset_index(drop=True)
    for split in (train_df, test_df)
)

In [None]:
# Preview the first rows of the training split.
train_df.head()


In [None]:
# Preview the first rows of the test split.
test_df.head()

In [None]:
# Row counts of the training and test splits.
len(train_df),len(test_df)

Here, "nums/length" and "nums/length_full_name" are the ratios of numeric characters to total length in the username and in the full name, respectively.

## ML Workflow ##

1. Data preprocessing used

    -  missing numerical values were replaced by the median, which reduces the effect of outliers
    -  missing categorical values (e.g. profile picture) were replaced with the mode
2. Data transformation

    - apply one-hot encoding to the features "profile pic", "private" and "external URL": presence = 1, absence = 0

3. Outlier Detection and Removal
    
    - use the interquartile range (IQR) or the z-score method to detect and remove outliers that might skew model performance

4. Feature selection

    - use a correlation matrix to detect and avoid multicollinearity
    - use Lasso (L1) regularization to select important features by shrinking the coefficients of less important features to zero
    - use Recursive Feature Elimination (RFE) to select the top-performing features by iteratively training and eliminating weaker features

5. Normalization

    - normalize high performing features

6. Model building using
    - for now logistic regression only
    - later implement: 1. KNN, 2. Random Forest, 3. SVC (support vector classifier)




# OUTLIER DETECTION AND REMOVAL #

In [None]:
# Split the columns into binary flags (including the 'fake' target) and the
# remaining non-binary columns; the non-binary ones are plotted and capped below.
binary_columns = ["profile pic","name==username","external URL","private","fake"]
non_binary_columns = [col for col in train_df.columns if col not in binary_columns]

In [None]:
# Number of non-binary columns.
len(non_binary_columns)

In [None]:
# Show the names of the non-binary columns.
non_binary_columns

In [None]:
# Draw one histogram (with a KDE overlay) per non-binary column.
# The [1:] slice skips the first entry of non_binary_columns.
for col in non_binary_columns[1:]:
  sns.histplot(data=train_df, x=col, kde=True)
  # plt.show must be *called*: the bare attribute reference `plt.show` is a
  # no-op, which left every histogram drawn onto the same axes.
  plt.show()

In [None]:
# for i in range(5):
#   sns.histplot(data = train_df,x = non_binary_columns[i],kde = True)
#   plt.show()

In [None]:
# calculating skewness of all the attributes
# Skewness of each non-binary column in the training split (before capping).
train_df[non_binary_columns].skew()

In [None]:
# Pairwise correlation matrix of the training columns.
# NOTE(review): assumes every column is numeric — confirm, since newer pandas
# versions no longer drop non-numeric columns silently.
train_df.corr()

In [None]:
# Feature columns fed to the logistic-regression model below.
# NOTE(review): presumably chosen from the correlation matrix above — confirm.
selected_columns = ['profile pic','nums/length username','fullname words','description length','external URL']

# Applying IQR range for Outlier Detection #

In [None]:
# making box plot for all the non_binary columns
# for col in non_binary_columns:
#   train_df.boxplot(column = col)
#   plt.show()

## capping the outliers #

In [None]:
# Work on copies so the outlier capping below leaves train_df / test_df untouched.
new_train_df = train_df.copy()
new_test_df = test_df.copy()

In [None]:
def filter_iqr(original_df, col, new_df, train=True):
  """Cap the outliers of one column at the 1.5 * IQR fences of ``original_df``.

  The quartiles are always computed from ``original_df[col]`` so that the same
  limits can be applied to held-out data without looking at it.

  Args:
    original_df: DataFrame whose ``col`` values define the quartiles.
    col: Name of the column to cap.
    new_df: DataFrame that receives the capped column (modified in place).
    train: If True, the capped values of ``original_df[col]`` are written to
      ``new_df[col]``. If False, ``new_df[col]`` itself is capped in place
      using the limits derived from ``original_df``.

  Returns:
    Tuple ``(lower_limit, upper_limit)`` that was used for clipping.
  """
  Q1 = original_df[col].quantile(0.25)
  Q3 = original_df[col].quantile(0.75)
  IQR = Q3 - Q1
  lower_limit = Q1 - 1.5 * IQR
  upper_limit = Q3 + 1.5 * IQR

  # Bug fix: the non-train branch used to read the module-level ``test_df``
  # instead of the frame passed in, so the function only worked for that one
  # global and ignored its own argument.
  source = original_df if train else new_df
  new_df[col] = source[col].clip(lower=lower_limit, upper=upper_limit)
  return (lower_limit, upper_limit)


In [None]:
from collections import defaultdict
# Maps column name -> (lower_limit, upper_limit). Because this is a
# defaultdict(tuple), looking up a column that was never capped returns an
# empty tuple instead of raising KeyError.
quartile_limits = defaultdict(tuple)
for col in non_binary_columns:
  # Cap the training column and remember the limits derived from it.
  quartile_limits[col] = filter_iqr(train_df,col,new_train_df)
  # Apply the same train-derived limits to the test copy.
  filter_iqr(train_df,col,new_test_df,train = False)

In [None]:
# Print the capping limits recorded for each selected feature.
for feature in selected_columns:
  print(f"{feature} : {quartile_limits[feature]}")

In [None]:
# Skewness of the non-binary columns in the capped training copy.
new_train_df[non_binary_columns].skew()

In [None]:
# Skewness of the non-binary columns in the capped test copy.
new_test_df[non_binary_columns].skew()

# Applying Logistic regression on IQR filtered data #

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Separate the 'fake' label column from the feature columns of each capped split.
y_train, y_test = new_train_df['fake'], new_test_df['fake']
X_train = new_train_df.drop(columns='fake')
X_test = new_test_df.drop(columns='fake')


# Logistic regression without sklearn #

In [None]:
def sigmoid(z):
  """Return the logistic function 1 / (1 + exp(-z)), element-wise.

  Evaluated as exp(-log(1 + exp(-z))) through ``np.logaddexp`` so that large
  negative inputs do not overflow inside ``np.exp`` (the naive form emits a
  RuntimeWarning there).

  Args:
    z: Scalar or NumPy array of logits.

  Returns:
    Value(s) in [0, 1] with the same shape as ``z``.
  """
  # logaddexp(0, -z) == log(exp(0) + exp(-z)) == log(1 + exp(-z)), computed stably.
  return np.exp(-np.logaddexp(0, -z))

In [None]:
def compute_cost_vectorized(X, w, b, y):
  """Return the mean binary cross-entropy of a logistic model.

  Uses the algebraically equivalent logit form
      -y*log(s(z)) - (1-y)*log(1-s(z)) == log(1 + exp(z)) - y*z
  so that saturated predictions (s(z) == 0 or 1) no longer produce log(0) and
  an inf/nan cost.

  Args:
    X: Feature matrix; the code indexes X.shape[0] and computes X @ w.
    w: Weight vector multiplied with X.
    b: Scalar bias added to X @ w.
    y: Labels (0 or 1), one per row of X.

  Returns:
    The total cost divided by the number of rows in X.
  """
  m = X.shape[0]
  z = X @ w + b  # logits for all examples at once
  labels = np.asarray(y, dtype=float)

  # log(1 + exp(z)) evaluated stably via logaddexp(0, z).
  cost = np.logaddexp(0, z) - labels * z
  total_cost = np.sum(cost) / m

  return total_cost


In [None]:
def gradient_functions_vectorized(X, w, b, y):
  """Return the gradients of the logistic cost with respect to w and b.

  Args:
    X: Two-dimensional feature matrix (its shape is unpacked into two values).
    w: Weight vector multiplied with X.
    b: Scalar bias.
    y: Labels, one per row of X.

  Returns:
    Tuple ``(dj_dw, dj_db)``: the weight gradient and the bias gradient, both
    averaged over the rows of X.
  """
  n_samples, _ = X.shape
  residual = sigmoid(X @ w + b) - y  # prediction minus label, per example

  # X.T @ residual sums each feature's contribution over all examples.
  grad_w = (X.T @ residual) / n_samples
  grad_b = np.sum(residual) / n_samples
  return grad_w, grad_b

In [None]:
def train_vectorized(X, y, alpha, num_iters, log_every=1000):
  """Fit logistic-regression parameters with batch gradient descent.

  Args:
    X: Feature matrix; weights are initialised to zeros of length X.shape[1].
    y: Labels, one per row of X.
    alpha: Learning rate applied to both gradients.
    num_iters: Number of gradient-descent steps.
    log_every: Record and print the cost every ``log_every`` iterations
      (default 1000, the previously hard-coded interval).

  Returns:
    Tuple ``(w, b, cost_history)`` where ``cost_history[k]`` is the cost
    measured after the update of iteration ``k * log_every``.

  Raises:
    ValueError: If ``log_every`` is not positive.
  """
  if log_every <= 0:
    raise ValueError("log_every must be a positive integer")

  cost_history = []
  w = np.zeros(X.shape[1])
  b = 0
  for i in range(num_iters):
    dj_dw, dj_db = gradient_functions_vectorized(X, w, b, y)
    w = w - alpha * dj_dw
    b = b - alpha * dj_db

    # Evaluate the cost only occasionally to save time.
    if i % log_every == 0:
      cost = compute_cost_vectorized(X, w, b, y)
      cost_history.append(cost)
      print(f"Iteration {i:5d}: Cost {cost:0.4f}")

  return w, b, cost_history


In [None]:
def predict_vectorized(X, w, b):
  """Return 0/1 class predictions for every row of X.

  A row is labelled 1 when its sigmoid output is at least 0.5, otherwise 0.
  """
  probabilities = sigmoid(X @ w + b)
  return (probabilities >= 0.5).astype(int)

In [None]:
#w, b,cost_history = train(X_train[selected_columns].values,y_train,0.01,15000)

In [None]:
# Train on the selected features only: learning rate 0.01, 15000 iterations.
# X is passed as a NumPy array (.values); y_train is passed as-is.
w, b,cost_history = train_vectorized(X_train[selected_columns].values,y_train,0.01,15000)

In [None]:
# Predict 0/1 labels for the test split using the same selected features.
y_pred = predict_vectorized(X_test[selected_columns].values,w,b)

In [None]:
# Accuracy of the predictions against the test labels.
accuracy_score(y_test,y_pred)

In [None]:
# Print the classification report for the test predictions.
print(classification_report(y_test, y_pred))

In [None]:
# cost_history holds one cost sample per 1000 training iterations (the logging
# interval used by train_vectorized), so scale the sample index back to the
# real iteration number; otherwise the "Iteration" axis would read 0..14
# instead of 0..14000.
logged_iterations = [sample * 1000 for sample in range(len(cost_history))]

plt.figure(figsize=(10, 6))
plt.plot(
    logged_iterations,  # X-axis: iteration at which each cost was recorded
    cost_history,       # Y-axis: cost values
    color='blue',
    linestyle='solid',
    linewidth=2
)

plt.xlabel('Iteration', fontsize=12)
plt.ylabel('Cost', fontsize=12)
plt.title('Learning Curve (Cost vs. Iterations)', fontsize=14)
plt.show()


In [None]:
# Print the classification report again (same call as the earlier report cell).
print(classification_report(y_test, y_pred))

In [None]:
# Display the current weights and bias.
w,b

In [None]:
# testing on real account
# Overwrites w and b with hard-coded values, one weight per entry of
# selected_columns (5 values).
# NOTE(review): presumably copied from an earlier training run — confirm they
# still match the current model before relying on them.
w = np.array([-2.80260132,  3.60909299, -0.35848741, -0.02805083, -1.33505898])
b = 2.659345278516536

In [None]:
# Print the capping limits recorded for each selected feature.
for feature in selected_columns:
  print(f"{feature} : {quartile_limits[feature]}")