# Instagram Account Detection - Real or Scam
##### Author - Ansh Mani Tripathi

### 1. Importing Libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn import set_config
import joblib
import warnings


### 2. Loading and Analyzing Data

In [3]:
# Load the dataset from a CSV file (relative path, resolved against the working directory)
df = pd.read_csv('LIMFADD.csv')

In [4]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(15000, 11)

In [5]:
# Preview the first five rows of the raw data
df.head()

Unnamed: 0,Followers,Following,Following/Followers,Posts,Posts/Followers,Bio,Profile Picture,External Link,Mutual Friends,Threads,Labels
0,2,2757,1378.5,0,0.0,N,N,N,0,N,Bot
1,2,505,252.5,0,0.0,N,Yes,N,0,N,Scam
2,6786,1782,0.262599469,1589,6051.040404,yes,N,Yes,10,N,Real
3,21,1281,61.0,0,0.0,N,Yes,N,0,N,Bot
4,585,1682,2.875213675,2663,926.1920333,yes,N,N,12,Yes,Real


In [6]:
# Report each column's dtype and non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Followers            15000 non-null  int64 
 1   Following            15000 non-null  int64 
 2   Following/Followers  15000 non-null  object
 3   Posts                15000 non-null  int64 
 4   Posts/Followers      15000 non-null  object
 5   Bio                  15000 non-null  object
 6   Profile Picture      15000 non-null  object
 7   External Link        15000 non-null  object
 8   Mutual Friends       15000 non-null  int64 
 9   Threads              15000 non-null  object
 10  Labels               15000 non-null  object
dtypes: int64(4), object(7)
memory usage: 1.3+ MB


### 3. Data Preprocessing
 
 Here, we perform the necessary cleaning steps:
 - Strip whitespace from column names.
 - Convert ratio columns (`Following/Followers`, `Posts/Followers`) to numeric types.
 - Impute (fill) the `NaN` values with the median of their respective columns.
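 - Encode binary text columns (`Bio`, `Profile Picture`, `External Link`, `Threads`) as `0` for 'N' and `1` for 'Yes' (matched case-insensitively); any other value is replaced by the column's most frequent value.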

In [7]:
# Remove leading/trailing whitespace from every column name so that later
# lookups by name (e.g. 'Following/Followers') match exactly
df.columns = df.columns.str.strip()

In [8]:
# Parse the two ratio columns as numbers; entries that cannot be parsed become NaN
for ratio_col in ('Following/Followers', 'Posts/Followers'):
    df[ratio_col] = pd.to_numeric(df[ratio_col], errors='coerce')

In [9]:
# Fill the NaNs in each ratio column with that column's median.
# NOTE(review): the median is computed on the full dataset, i.e. before the
# train/test split performed later in the notebook.
for ratio_col in ('Following/Followers', 'Posts/Followers'):
    col_median = df[ratio_col].median()
    df[ratio_col] = df[ratio_col].fillna(col_median)

In [10]:
# Encode binary columns: N=0, Yes=1
binary_cols = ['Bio', 'Profile Picture', 'External Link', 'Threads']
for col in binary_cols:
    # Trim and lower-case first so spelling variants ('Yes', 'yes', ' N') map alike
    df[col] = df[col].str.strip().str.lower().map({'n': 0, 'yes': 1})
    # Values outside the mapping became NaN; replace them with the column's most frequent value
    df[col] = df[col].fillna(df[col].mode()[0])

print("Preprocessing complete. Data types after cleaning:")
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would emit a stray "None" line after the report.
df.info()
print("\nCleaned DataFrame head:")
display(df.head())

Preprocessing complete. Data types after cleaning:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Followers            15000 non-null  int64  
 1   Following            15000 non-null  int64  
 2   Following/Followers  15000 non-null  float64
 3   Posts                15000 non-null  int64  
 4   Posts/Followers      15000 non-null  float64
 5   Bio                  15000 non-null  int64  
 6   Profile Picture      15000 non-null  int64  
 7   External Link        15000 non-null  int64  
 8   Mutual Friends       15000 non-null  int64  
 9   Threads              15000 non-null  int64  
 10  Labels               15000 non-null  object 
dtypes: float64(2), int64(8), object(1)
memory usage: 1.3+ MB
None

Cleaned DataFrame head:


Unnamed: 0,Followers,Following,Following/Followers,Posts,Posts/Followers,Bio,Profile Picture,External Link,Mutual Friends,Threads,Labels
0,2,2757,1378.5,0,0.0,0,0,0,0,0,Bot
1,2,505,252.5,0,0.0,0,1,0,0,0,Scam
2,6786,1782,0.262599,1589,6051.040404,1,0,1,10,0,Real
3,21,1281,61.0,0,0.0,0,1,0,0,0,Bot
4,585,1682,2.875214,2663,926.192033,1,0,0,12,1,Real


## 4. Feature and Target Definition
 
 We define our feature set `X` (all columns except 'Labels') and our target `y` ('Labels'). We then **encode the text labels in `y` into integers**.

In [11]:
# Separate the raw (text) target column from the feature matrix
y_raw = df['Labels']
X = df.drop(columns='Labels')

In [12]:
# --- Encode the target variable (y) ---
# Learn the label -> integer mapping first, then apply it to the target column
le = LabelEncoder().fit(y_raw)
y = le.transform(y_raw)

In [13]:
# Show which integer code each class label was assigned
label_to_code = {label: code for code, label in enumerate(le.classes_)}
print("Label Encoder Mapping:")
for label, code in label_to_code.items():
    print(f"{label}: {code}")

Label Encoder Mapping:
Bot: 0
Real: 1
Scam: 2
Spam: 3


In [14]:
# All feature columns are numeric at this point: the six count/ratio columns
# followed by the binary flag columns that were encoded to 0/1 above
numeric_features = [
    'Followers', 'Following', 'Following/Followers',
    'Posts', 'Posts/Followers', 'Mutual Friends',
    *binary_cols,
]

print("\nFeatures (X) shape:", X.shape)
print("Target (y) shape:", y.shape)


Features (X) shape: (15000, 10)
Target (y) shape: (15000,)


## 5. Building the Preprocessing and Modeling Pipeline
 
 A `ColumnTransformer` is used to apply `StandardScaler` to our numeric features. This transformer is then placed as the first step in a `Pipeline`, followed by the Random Forest classifier.

In [15]:
# Build the preprocessing step: standardise every column named in
# numeric_features and forward any column not listed there unchanged
scaling_step = ('num', StandardScaler(), numeric_features)
preprocessor = ColumnTransformer(
    transformers=[scaling_step],
    remainder='passthrough',
)

In [16]:
# Split data into training and testing sets:
#   - test_size=0.2 holds out 20% of the rows for evaluation
#   - stratify=y keeps the class proportions the same in both parts
#   - random_state=42 makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [17]:
# Report how many rows ended up in each part of the split
n_train, n_test = len(X_train), len(X_test)
print(f"Training set size: {n_train} samples")
print(f"Test set size: {n_test} samples")

Training set size: 12000 samples
Test set size: 3000 samples


## 6. Model Training and Evaluation
 
 We will now create the full pipeline and train the Random Forest model.

In [18]:
# --- Training and Evaluating the Random Forest Model ---
# Chain the preprocessing step and the classifier into a single estimator;
# the fixed random_state keeps the forest reproducible between runs
rf_classifier = RandomForestClassifier(random_state=42)
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', rf_classifier),
])

In [19]:
# Train the model: fitting the pipeline fits the preprocessor on the training
# features and then the classifier on the transformed result
print("Training the Random Forest model...")
rf_pipeline.fit(X_train, y_train)
print("Training complete.")

Training the Random Forest model...
Training complete.


In [20]:
# Predict the encoded class for every held-out test row
y_pred = rf_pipeline.predict(X_test)

In [21]:
# Score the test-set predictions
accuracy = accuracy_score(y_test, y_pred)
weighted_f1 = f1_score(y_test, y_pred, average='weighted')
# Rows are true classes and columns are predicted classes, in encoded label order
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"\nAccuracy: {accuracy:.4f}")
print(f"F1-score (weighted): {weighted_f1:.4f}")
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.9697
F1-score (weighted): 0.9697
Confusion Matrix:
[[739   0  11   0]
 [  0 701   0  49]
 [  1   0 749   0]
 [  0  30   0 720]]


## 7. Save the Model and Encoder
 
 The trained pipeline and the label encoder are saved to files. These are the two files our backend application will need to make predictions.

In [22]:
# Persist the two artifacts needed at prediction time
model_path = 'instagram_account_detector.joblib'
encoder_path = 'label_encoder.joblib'

# The fitted pipeline (preprocessing + classifier)
joblib.dump(rf_pipeline, model_path)
print(f"The Random Forest pipeline has been saved as '{model_path}'")

# The label encoder, needed to turn numeric predictions back into text labels
joblib.dump(le, encoder_path)
print(f"The label encoder has been saved as '{encoder_path}'")

The Random Forest pipeline has been saved as 'instagram_account_detector.joblib'
The label encoder has been saved as 'label_encoder.joblib'


## 8. Visualize the Pipeline

 This diagram shows the two main steps of our model: the `preprocessor` which handles feature scaling, and the `classifier` which is our trained Random Forest model.

In [23]:
# Set the display configuration to 'diagram' so estimators render as an
# HTML diagram in the notebook
set_config(display='diagram')

In [24]:
# Display the pipeline object (a bare last expression is rendered by the notebook)
rf_pipeline

## 9. Make a Prediction on New Data
 
 Here we simulate how the web application will use the model. We collect a sample data point interactively, derive the ratio features, run it through the trained pipeline, and decode the numeric prediction back into a human-readable label.

In [25]:
# --- Prediction on a single user input ---

# 1. Collect the raw inputs interactively.
#    Count answers are parsed as integers; yes/n answers are trimmed and lower-cased.
count_prompts = {
    'Followers': "Enter no. of Followers: ",
    'Following': "Enter no. of Following: ",
    'Posts': "Enter no. of Posts: ",
    'Mutual Friends': "Enter no. of Mutual Friends: ",
}
flag_prompts = {
    'Bio': "Does the user have a Bio? (yes/n): ",
    'Profile Picture': "Does the user have a Profile Picture? (yes/n): ",
    'External Link': "Does the user have an External Link? (yes/n): ",
    'Threads': "Does the user use Threads? (yes/n): ",
}
# Prompts are asked in the order listed above: counts first, then flags
user_input = {
    **{field: int(input(prompt)) for field, prompt in count_prompts.items()},
    **{field: input(prompt).strip().lower() for field, prompt in flag_prompts.items()},
}


In [26]:
# 2. Preprocess the input just like the training data
#    - Create a DataFrame
#    - Calculate ratio columns
#    - Map binary columns
input_df = pd.DataFrame([user_input])
for ratio_col, numerator_col in (('Following/Followers', 'Following'), ('Posts/Followers', 'Posts')):
    ratio = input_df[numerator_col] / input_df['Followers']
    # With 0 followers the division yields inf (or NaN for 0/0); scikit-learn's
    # input validation rejects inf. Treat such a ratio as missing and fill it with
    # the training column's median, mirroring how unparseable ratios in the
    # training data were imputed.
    ratio = ratio.replace([np.inf, -np.inf], np.nan)
    input_df[ratio_col] = ratio.fillna(df[ratio_col].median())
# NOTE(review): in the rows shown by df.head(), the dataset's 'Posts/Followers'
# values (e.g. 6051.04 for 1589 posts and 6786 followers) do not equal
# Posts / Followers. Confirm how the dataset derives that column; otherwise the
# feature computed here differs from what the model was trained on.

In [27]:
# The pipeline expects numeric 0/1 for binary columns, so we map them here.
# Answers other than 'yes' / 'n' are rejected explicitly: mapping them would
# silently produce NaN and feed a missing value into the model.
flag_codes = {'n': 0, 'yes': 1}
for col in binary_cols:
    answers = input_df[col].str.strip().str.lower()
    unrecognised = answers[~answers.isin(list(flag_codes))]
    if not unrecognised.empty:
        raise ValueError(
            f"Unrecognised value(s) for '{col}': {unrecognised.tolist()}; expected 'yes' or 'n'"
        )
    input_df[col] = answers.map(flag_codes)

In [28]:
# Ensure the column order is the same as the training data; selecting by
# X.columns raises a KeyError if any training column is missing from the input
input_df = input_df[X.columns]

In [29]:
# 3. Use the trained pipeline to predict; the result is an array holding the
#    encoded class for the single input row
numeric_prediction = rf_pipeline.predict(input_df)

In [30]:
# 4. Use the label encoder to get the original text label back from the
#    integer code produced by the model
final_prediction = le.inverse_transform(numeric_prediction)

In [31]:
# Echo the raw answers as entered, then the decoded label for the single input row
print(f"Input Data:\n{user_input}")
print("-" * 50)
print(f"Predicted Account Type: {final_prediction[0]}")

Input Data:
{'Followers': 55, 'Following': 53, 'Posts': 334, 'Mutual Friends': 0, 'Bio': 'n', 'Profile Picture': 'n', 'External Link': 'n', 'Threads': 'n'}
--------------------------------------------------
Predicted Account Type: Spam
