In [1]:

# Define the base path where the files are located.
base_path = r'C:\Users\Buboy'


In [2]:

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler


Matplotlib is building the font cache; this may take a moment.


In [3]:

# Load datasets
customer_df = pd.read_csv(f"{base_path}\\customer_demographics_contaminated.csv")
transactions_df = pd.read_csv(f"{base_path}\\customer_transactions_contaminated.csv")
social_df = pd.read_csv(f"{base_path}\\social_media_interactions_contaminated.csv")


In [4]:

# Inspect structure
print(customer_df.info())
print(transactions_df.info())
print(social_df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3200 entries, 0 to 3199
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   CustomerID   3200 non-null   object
 1   Age          2909 non-null   object
 2   Gender       3200 non-null   object
 3   Location     3200 non-null   object
 4   IncomeLevel  2897 non-null   object
 5   SignupDate   3200 non-null   object
dtypes: object(6)
memory usage: 150.1+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3200 entries, 0 to 3199
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   CustomerID       3200 non-null   object
 1   TransactionID    3200 non-null   object
 2   TransactionDate  3200 non-null   object
 3   Amount           2896 non-null   object
 4   ProductCategory  2901 non-null   object
 5   PaymentMethod    3200 non-null   object
dtypes: object(6)
memory usage: 150.1+ KB
None
<class

In [5]:

# Remove duplicates
customer_df.drop_duplicates(inplace=True)
transactions_df.drop_duplicates(inplace=True)
social_df.drop_duplicates(inplace=True)


In [6]:

# Check missing values
print(customer_df.isnull().sum())
print(transactions_df.isnull().sum())
print(social_df.isnull().sum())


CustomerID       0
Age            276
Gender           0
Location         0
IncomeLevel    285
SignupDate       0
dtype: int64
CustomerID           0
TransactionID        0
TransactionDate      0
Amount             283
ProductCategory    282
PaymentMethod        0
dtype: int64
CustomerID           0
InteractionID        0
InteractionDate      0
Platform           291
InteractionType      0
Sentiment          309
dtype: int64


In [7]:

# Handle missing values (MCAR + MNAR)
customer_df['Age'] = pd.to_numeric(customer_df['Age'], errors='coerce')
customer_df['Age'].fillna(customer_df['Age'].median(), inplace=True)
customer_df['IncomeLevel'].fillna(customer_df['IncomeLevel'].mode()[0], inplace=True)
customer_df['SignupDate'] = pd.to_datetime(customer_df['SignupDate'], errors='coerce')

transactions_df['Amount'] = pd.to_numeric(transactions_df['Amount'], errors='coerce')
transactions_df['Amount'].fillna(transactions_df['Amount'].median(), inplace=True)
transactions_df['ProductCategory'].fillna('Unknown', inplace=True)
transactions_df['PaymentMethod'].fillna('Unknown', inplace=True)
transactions_df['TransactionDate'] = pd.to_datetime(transactions_df['TransactionDate'], errors='coerce')

social_df['Platform'].fillna('Unknown', inplace=True)
social_df['Sentiment'].fillna('Neutral', inplace=True)
social_df['InteractionDate'] = pd.to_datetime(social_df['InteractionDate'], errors='coerce')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  customer_df['Age'].fillna(customer_df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  customer_df['IncomeLevel'].fillna(customer_df['IncomeLevel'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work b

In [8]:

# Encode categorical variables
le = LabelEncoder()
customer_df['Gender'] = le.fit_transform(customer_df['Gender'])
customer_df['IncomeLevel'] = le.fit_transform(customer_df['IncomeLevel'])

transactions_df['ProductCategory'] = le.fit_transform(transactions_df['ProductCategory'])
transactions_df['PaymentMethod'] = le.fit_transform(transactions_df['PaymentMethod'])

social_df['Platform'] = le.fit_transform(social_df['Platform'])
social_df['Sentiment'] = le.fit_transform(social_df['Sentiment'])


In [9]:

# Normalize numerical columns
scaler = MinMaxScaler()
customer_df['Age'] = scaler.fit_transform(customer_df[['Age']])
transactions_df['Amount'] = scaler.fit_transform(transactions_df[['Amount']])


In [10]:

# Save cleaned datasets
customer_df.to_csv(f"{base_path}\\customer_demographics_cleaned.csv", index=False)
transactions_df.to_csv(f"{base_path}\\customer_transactions_cleaned.csv", index=False)
social_df.to_csv(f"{base_path}\\social_media_interactions_cleaned.csv", index=False)
