In [2]:
import pandas as pd
import os

### Merge the per-category complaint CSV files from the data folder into a single CSV file, adding a `Category` column (derived from each filename) and a default `Label` of `genuine`.

In [None]:
folderPath = r"D:/JantaKoAwaj-FYP/jka-ml-model/data"
outputFolder = r"D:/JantaKoAwaj-FYP/jka-ml-model/dataset"
outputFilePath = os.path.join(outputFolder, 'final_complaints_dataset.csv')

# CSV files inside folderPath that must NOT be tagged as genuine complaints.
# The spam complaints file is read from this same folder further down in the
# notebook, so merging it here would silently label spam rows as 'genuine'.
# NOTE(review): extend this tuple if other non-category CSVs live in the folder.
nonGenuineFiles = ('spam_complaints.csv',)


def merge_category_csvs(folder, output_path, skip_files=nonGenuineFiles):
    """Merge every per-category CSV in `folder` into one CSV at `output_path`.

    Each input file contributes its rows plus two extra columns:
    `Category` (the file name without extension, underscores replaced by
    spaces, title-cased) and `Label` (always 'genuine').

    Args:
        folder: Directory containing files named like 'category_name.csv'.
        output_path: Destination CSV path; its parent folder is created
            if it does not exist yet.
        skip_files: File names (compared case-insensitively) to leave out.

    Returns:
        The merged DataFrame, or None when no matching CSV file was found
        (in which case nothing is written).

    Raises:
        FileNotFoundError: If `folder` does not exist.
    """
    skipped = {name.lower() for name in skip_files}
    frames = []

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the merged file reproducible from run to run.
    for filename in sorted(os.listdir(folder)):
        stem, extension = os.path.splitext(filename)
        if extension.lower() != '.csv' or filename.lower() in skipped:
            continue

        # 'utf-8-sig' reads files with or without a BOM, matching the
        # encoding used when the datasets are written.
        df = pd.read_csv(os.path.join(folder, filename), encoding='utf-8-sig')

        # splitext strips only the trailing extension, unlike
        # str.replace('.csv', '') which would also hit '.csv' mid-name.
        df['Category'] = stem.replace('_', ' ').title()
        df['Label'] = 'genuine'
        frames.append(df)

    if not frames:
        return None

    merged = pd.concat(frames, ignore_index=True)

    # Create the output folder if it is missing (no-op when it exists).
    os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
    merged.to_csv(output_path, index=False, encoding='utf-8-sig')
    return merged


if not os.path.isdir(folderPath):
    print(f"Input folder not found: {folderPath}")
else:
    merged_df = merge_category_csvs(folderPath, outputFilePath)
    if merged_df is not None:
        print(f"Merged successfully into {outputFilePath}")
    else:
        print("No data to merge. Please check your input files.")


Merged successfully into D:/JantaKoAwaj-FYP/jka-ml-model/dataset\final_complaints_dataset.csv


#### Merging genuine and spam (not genuine) complaints into a single balanced dataset

In [3]:
outputFolder = r"D:/JantaKoAwaj-FYP/jka-ml-model/dataset"
outputFilePath = os.path.join(outputFolder, "final_balanced_dataset.csv")
genuinePath = r'D:/JantaKoAwaj-FYP/jka-ml-model/dataset/normalized_dataset.csv'
spamPath = r'D:/JantaKoAwaj-FYP/jka-ml-model/data/spam_complaints.csv'

# Tunable settings for the balancing step.
genuineSampleSize = 5000  # number of genuine complaints to keep
randomSeed = 42           # fixed seed so sampling and shuffling are reproducible


def build_balanced_dataset(genuine, spam, n_genuine=5000, seed=42):
    """Combine a random sample of genuine complaints with all spam complaints.

    Steps: sample up to `n_genuine` rows from `genuine`, drop any existing
    'SNo' column from both frames, concatenate, shuffle, and insert a fresh
    1-based 'SNo' column as the first column.

    Args:
        genuine: DataFrame of genuine complaints.
        spam: DataFrame of spam complaints (all rows are kept).
        n_genuine: Maximum number of genuine rows to sample. If `genuine`
            has fewer rows, all of them are used instead of raising the
            ValueError that DataFrame.sample would throw.
        seed: Random state used for both the sampling and the shuffle.

    Returns:
        A new DataFrame; the input frames are not modified.
    """
    # DataFrame.sample(n=...) without replacement fails when n exceeds the
    # number of rows, so cap the request at what is actually available.
    n_take = min(n_genuine, len(genuine))
    genuine_part = genuine.sample(n=n_take, random_state=seed)

    # Old serial numbers are meaningless after sampling/merging.
    genuine_part = genuine_part.drop(columns=['SNo'], errors='ignore')
    spam_part = spam.drop(columns=['SNo'], errors='ignore')

    combined = pd.concat([genuine_part, spam_part], ignore_index=True)

    # Shuffle so genuine and spam rows are interleaved.
    combined = combined.sample(frac=1, random_state=seed).reset_index(drop=True)

    # Fresh serial numbers, 1..N, as the first column.
    combined.insert(0, "SNo", range(1, len(combined) + 1))
    return combined


missingInputs = [path for path in (genuinePath, spamPath) if not os.path.isfile(path)]
if missingInputs:
    print(f"Input file(s) not found: {missingInputs}")
else:
    # reading both datasets
    df_genuine = pd.read_csv(genuinePath, encoding='utf-8-sig')
    df_spam = pd.read_csv(spamPath, encoding='utf-8-sig')

    if len(df_genuine) < genuineSampleSize:
        print(f"Only {len(df_genuine)} genuine complaints available; using all of them.")

    final_df = build_balanced_dataset(
        df_genuine, df_spam, n_genuine=genuineSampleSize, seed=randomSeed
    )

    # saving the final dataset
    final_df.to_csv(outputFilePath, index=False, encoding="utf-8-sig")

    print("Balanced dataset after merging spam and genuine complaints is save successfully!")


Balanced dataset after merging spam and genuine complaints is save successfully!
