In [21]:
import os
import pandas as pd

# Read 'processed_instagram_data.csv' (path relative to the working directory) into `df`.
# NOTE(review): the next cell reassigns `df` from 'minor_college/final_csv.csv', so on a
# top-to-bottom run this frame is replaced before anything visible here uses it — confirm
# whether this cell is still needed.
df = pd.read_csv('processed_instagram_data.csv')

In [22]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import StandardScaler

# Load the data
df = pd.read_csv('minor_college/final_csv.csv')  # Replace with your CSV file

# Step 1: Handle Missing Values
# Free-text columns: a missing value becomes an empty string
for text_column in ('Bio', 'Full Name'):
    df[text_column] = df[text_column].fillna('')

# Rows lacking any of the essential columns below are discarded
df = df.dropna(subset=['Followers', 'Following', 'Username'])

# Step 2: Remove Outliers using IQR Method
def remove_outliers(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles of ``df[column]``. Values sitting exactly
    on a fence are kept. NaN values compare False against both fences, so rows
    with a missing value in ``column`` are kept as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    column : hashable
        Label of the numeric column the fences are computed on.
    factor : float, default 1.5
        Multiple of the IQR that sets how far the fences sit from Q1/Q3.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the retained rows, with their original index
        labels. Returning a copy lets callers add columns to the result
        without pandas treating it as a slice of the input frame.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - factor * iqr
    upper_fence = q3 + factor * iqr
    is_outlier = (df[column] < lower_fence) | (df[column] > upper_fence)
    return df[~is_outlier].copy()

# Apply the IQR filter to each count column in turn; the second pass only
# sees the rows that survived the first.
for count_column in ('Followers', 'Following'):
    df = remove_outliers(df, count_column)

# Step 3: Create New Features


def _count_digits(text):
    """Number of characters in ``text`` for which ``str.isdigit`` is true."""
    return sum(1 for character in text if character.isdigit())


def _count_non_word_chars(text):
    r"""Number of ``\W`` regex matches in ``text``.

    ``\W`` does not match the underscore, so '_' is not counted here.
    """
    return len(re.findall(r'\W', text))


usernames = df['Username']

# (a) Username length
df['username_length'] = usernames.apply(len)

# (b) Number of digits in username
df['username_digits'] = usernames.apply(_count_digits)

# (c) Special characters in username
df['username_special_chars'] = usernames.apply(_count_non_word_chars)

# (d) Full name vs. username similarity
def name_username_similarity(row):
    """Flag whether the account's full name appears inside its username.

    The full name is lowercased and stripped of spaces, the username is
    lowercased, and the result is 1 when the former is a substring of the
    latter, otherwise 0.

    A full name that is empty after normalization yields 0: the empty string
    is a substring of every string, so without this guard every account with
    no full name would be scored as a match.

    Parameters
    ----------
    row : mapping with string values under the keys 'Full Name' and 'Username'
        (e.g. a DataFrame row passed by ``df.apply(..., axis=1)``).

    Returns
    -------
    int
        1 for a match, 0 otherwise.
    """
    full_name = row['Full Name'].lower().replace(' ', '')
    if not full_name:
        return 0
    username = row['Username'].lower()
    return int(full_name in username)

# Row-wise flag produced by name_username_similarity
df['name_username_similarity'] = df.apply(name_username_similarity, axis=1)

# (e) Bio length
df['bio_length'] = df['Bio'].apply(len)

# (f) Suspicious keywords in bio
suspicious_keywords = ["money", "giveaway", "free", "bitcoin", "forex", "investment"]


def _mentions_suspicious_keyword(bio):
    """Return 1 if any entry of ``suspicious_keywords`` is a substring of the lowercased bio, else 0."""
    lowered = bio.lower()
    for keyword in suspicious_keywords:
        if keyword in lowered:
            return 1
    return 0


df['bio_suspicious'] = df['Bio'].apply(_mentions_suspicious_keyword)

# (g) Follower/Following ratio
# A 'Following' count of zero is swapped for 1 so the division never has a zero denominator
safe_following = np.where(df['Following'] == 0, 1, df['Following'])
df['follower_following_ratio'] = df['Followers'] / safe_following

# Step 4: Normalize/Standardize numerical features

# Columns to standardize
numerical_features = [
    'Followers',
    'Following',
    'username_length',
    'bio_length',
    'follower_following_ratio',
    'Number of Posts',
]

# Fit the scaler on these columns and overwrite them with the scaled values
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[numerical_features])
df[numerical_features] = scaled_values


# Nothing is written to disk in this cell: 'processed_instagram_data.csv' is only
# produced by the later df.to_csv(...) cell, so the message must not claim a save.
print("Data cleaning and processing complete. Result is held in `df` (not yet saved to 'processed_instagram_data.csv').")


Data cleaning and processing complete. Saved to 'processed_instagram_data.csv'.


In [23]:
# List of columns to drop because they won't be used for training
columns_to_drop = [
    'Username', 'Full Name', 'Bio',
    'Username Length', 'Username Digits', 'Username Special Chars',
    'Full Name Length', 'Username Similarity', 'Full Name Words',
    'Has Profile Pic', 'Profile Pic Path',
    'Bio Length', 'Bio Words', 'Bio Has Suspicious',
    'External URL', 'Has External URL', 'Is Private',
]  # Adjust as necessary

# Drop only the columns that are still present, so this cell can be re-run on an
# already-trimmed frame (a plain drop() raises KeyError the second time).
# Anything skipped is reported rather than ignored silently, so typos stay visible.
columns_absent = [column for column in columns_to_drop if column not in df.columns]
if columns_absent:
    print(f"Columns not found in df (skipped): {columns_absent}")
df = df.drop(columns=[column for column in columns_to_drop if column in df.columns])

# Drop rows where 'Followers' or 'Following' is missing
df = df.dropna(subset=['Followers', 'Following'])

# Renumber the rows 0..n-1 after the row drops (reassignment instead of inplace=True)
df = df.reset_index(drop=True)

print(f"Cleaned data shape: {df.shape}")


Cleaned data shape: (1021, 11)


In [24]:

# Step 5: Save the cleaned and processed data
# Writes the current `df` to 'processed_instagram_data.csv' (path relative to the
# working directory); index=False leaves the row index out of the file.
df.to_csv('processed_instagram_data.csv', index=False)

NameError: name 'y' is not defined

In [11]:
import os
import pandas as pd

# Folder where your CSV files are located
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
folder_path = 'C:/Users/shriv/Downloads/Minor_Project/DataCollection/data'

# Read every CSV in the folder into a list of dataframes.
# - sorted(): os.listdir returns names in arbitrary order, so sorting keeps the
#   merged row order reproducible between runs and machines.
# - lower(): matches '.CSV' as well as '.csv'.
# - A comprehension is used so no loop variable named `df` is left behind to
#   overwrite a dataframe of that name defined by other cells.
dataframes = [
    pd.read_csv(os.path.join(folder_path, filename))
    for filename in sorted(os.listdir(folder_path))
    if filename.lower().endswith(".csv")
]

# pd.concat raises a bare "No objects to concatenate" on an empty list; raise the
# same exception type with the folder named so the cause is obvious.
if not dataframes:
    raise ValueError(f"No CSV files found in {folder_path!r}")

# Concatenate all dataframes into a single dataframe with a fresh 0..n-1 index
merged_df = pd.concat(dataframes, ignore_index=True)

# Save the merged dataframe to a new CSV file (relative to the working directory)
merged_df.to_csv('mergedtext_file.csv', index=False)

print("CSV files merged successfully!")

CSV files merged successfully!
