Combine both training and testing data sets

In [124]:
import pandas as pd

# Load each split and tag its rows so they can be told apart after stacking:
# is_train is 1 for rows from train_data.csv and 0 for rows from test_data.csv.
train_data = pd.read_csv('train_data.csv').assign(is_train=1)
test_data = pd.read_csv('test_data.csv').assign(is_train=0)

# Stack the two splits into one frame with a fresh 0..n-1 index
combined_data = pd.concat((train_data, test_data), ignore_index=True)

# Persist the stacked frame for later reuse (optional)
combined_data.to_csv('combined_data.csv', index=False)

# Display the combined dataset (the notebook truncates long frames on its own)
combined_data


Unnamed: 0,Timestamp,Age,Sex,Relationship Status,Occupation,Social Media User?,Platforms Used,Time Spent,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Target,Difficulty_level,is_train
0,4/19/2022 21:55:45,24,0,Single,Salaried Worker,Yes,"Facebook, Twitter, Instagram, YouTube",2,3,3,2,2,4,3,2,4,2,low,1
1,4/22/2022 2:35:48,46,0,Married,Salaried Worker,Yes,"Facebook, YouTube",0,2,1,1,1,1,1,1,2,2,low,1
2,5/21/2022 22:19:10,30,1,Married,Salaried Worker,Yes,"Facebook, Instagram, YouTube",4,2,4,4,2,4,4,4,2,3,medium,1
3,4/18/2022 21:48:07,56,1,Married,Retired,Yes,YouTube,1,1,1,1,1,3,2,1,3,2,low,1
4,5/12/2022 0:16:23,19,0,Single,School Student,Yes,"Facebook, Instagram, YouTube, TikTok",4,3,3,4,3,3,3,4,3,3,medium,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475,4/21/2022 20:47:50,21,1,Single,University Student,Yes,"Facebook, YouTube",1,4,3,1,3,4,3,5,2,3,medium,0
476,4/26/2022 22:13:41,22,1,Single,University Student,Yes,"Facebook, Instagram, YouTube, Snapchat, Pinter...",2,5,5,3,2,5,5,3,1,3,medium,0
477,5/12/2022 19:15:46,24,1,In a relationship,University Student,Yes,"Facebook, Twitter, Instagram, YouTube, Reddit",5,5,2,4,4,5,4,3,2,2,low,0
478,4/18/2022 20:38:00,35,1,Married,Salaried Worker,Yes,"Facebook, YouTube",3,4,4,3,4,3,4,4,4,3,medium,0


In [125]:
# Social media platforms that each receive their own 0/1 indicator
social_media_platforms = ["Facebook", "Instagram", "YouTube", "Snapchat", "Twitter", "Discord", "Reddit", "Pinterest", "TikTok"]

# Function to transform social media platforms
def transform_social_media(platforms):
    """Turn a comma-separated platform string into per-platform 0/1 flags.

    Parameters
    ----------
    platforms : str or missing value
        Comma-separated platform names, e.g. "Facebook, YouTube". Anything
        that is not a string (NaN, None) is treated as "no platforms listed".

    Returns
    -------
    dict
        One key per entry of ``social_media_platforms`` (in that order),
        mapped to 1 if the platform is listed in ``platforms`` and 0 otherwise.
        Names that are not in ``social_media_platforms`` are ignored.
    """
    result = {platform: 0 for platform in social_media_platforms}

    # pandas hands missing cells to .apply as float NaN, which has no .split();
    # return the all-zero row instead of raising AttributeError.
    if not isinstance(platforms, str):
        return result

    # Split on the bare comma and strip each piece so that "A,B" and "A , B"
    # are recognised just like the canonical "A, B".
    for platform in platforms.split(','):
        platform = platform.strip()
        if platform in result:
            result[platform] = 1
    return result

# Apply transformation to social media platforms column: one dict of 0/1 flags per row
social_media_transformed = combined_data["Platforms Used"].apply(transform_social_media)

# Build the indicator frame on combined_data's own index. Without index=..., the
# new frame gets a default 0..n-1 index and the axis=1 concat below would align
# rows by label, silently misaligning (and introducing NaN rows) whenever
# combined_data does not carry a default RangeIndex.
social_media_df = pd.DataFrame(list(social_media_transformed), index=combined_data.index)

# Append the indicator columns and drop the original text column that they replace.
# Chained instead of inplace=True so the result is produced in a single expression.
combined_data = (
    pd.concat([combined_data, social_media_df], axis=1)
    .drop(columns=["Platforms Used"])
)



In [126]:
# Preview the first five rows, now including the per-platform indicator columns
combined_data.head(5)

Unnamed: 0,Timestamp,Age,Sex,Relationship Status,Occupation,Social Media User?,Time Spent,Q1,Q2,Q3,...,is_train,Facebook,Instagram,YouTube,Snapchat,Twitter,Discord,Reddit,Pinterest,TikTok
0,4/19/2022 21:55:45,24,0,Single,Salaried Worker,Yes,2,3,3,2,...,1,1,1,1,0,1,0,0,0,0
1,4/22/2022 2:35:48,46,0,Married,Salaried Worker,Yes,0,2,1,1,...,1,1,0,1,0,0,0,0,0,0
2,5/21/2022 22:19:10,30,1,Married,Salaried Worker,Yes,4,2,4,4,...,1,1,1,1,0,0,0,0,0,0
3,4/18/2022 21:48:07,56,1,Married,Retired,Yes,1,1,1,1,...,1,0,0,1,0,0,0,0,0,0
4,5/12/2022 0:16:23,19,0,Single,School Student,Yes,4,3,3,4,...,1,1,1,1,0,0,0,0,0,1


In [127]:
# Inspect the raw (still textual) target column before it is encoded
combined_data.loc[:, 'Difficulty_level']

0         low
1         low
2      medium
3         low
4      medium
        ...  
475    medium
476    medium
477       low
478    medium
479    medium
Name: Difficulty_level, Length: 480, dtype: object

In [128]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Convert categorical variables to numeric using LabelEncoder.
# Each column gets its own fitted encoder, kept in label_encoders, so the
# integer codes can later be mapped back with inverse_transform. (Re-fitting one
# shared encoder would discard the classes of every column but the last.)
categorical_columns = ['Sex', 'Relationship Status', 'Occupation', 'Social Media User?']
label_encoders = {}
for column in categorical_columns:
    # After the loop, label_encoder still refers to the encoder fitted on the
    # last column ('Social Media User?'), as it did when one encoder was reused.
    label_encoder = LabelEncoder()
    combined_data[column] = label_encoder.fit_transform(combined_data[column])
    label_encoders[column] = label_encoder

# Map Difficulty_level to numerical categories (0, 1, 2)
difficulty_mapping = {'low': 0, 'medium': 1, 'high': 2}
difficulty_raw = combined_data['Difficulty_level']

# Only map while the column still holds text labels: mapping an already-encoded
# column would turn every value into NaN, which is what happens if this cell is
# simply run a second time.
if not pd.api.types.is_numeric_dtype(difficulty_raw):
    difficulty_encoded = difficulty_raw.map(difficulty_mapping)

    # Series.map yields NaN for labels missing from the mapping; fail loudly on
    # unexpected labels (e.g. a typo or different casing) instead of letting
    # NaN targets flow downstream. Genuinely missing labels are left as NaN.
    unmapped_labels = difficulty_raw[difficulty_encoded.isna() & difficulty_raw.notna()].unique()
    if len(unmapped_labels) > 0:
        raise ValueError(f"Unexpected Difficulty_level values: {list(unmapped_labels)}")

    combined_data['Difficulty_level'] = difficulty_encoded

# Split dataset into features and target.
# NOTE(review): 'Target' stays in the features. In the rows displayed above,
# Target == 2 pairs with 'low' and Target == 3 with 'medium'; if
# Difficulty_level is derived from Target this is label leakage — TODO confirm
# and drop 'Target' here if so.
X_combined = combined_data.drop(['Timestamp', 'Difficulty_level'], axis=1)
y_combined = combined_data['Difficulty_level']

y_combined.head()

0    0
1    0
2    1
3    0
4    1
Name: Difficulty_level, dtype: int64

In [132]:
# Preview the first five rows of the fully numeric feature matrix
X_combined.head(n=5)

Unnamed: 0,Age,Sex,Relationship Status,Occupation,Social Media User?,Time Spent,Q1,Q2,Q3,Q4,...,is_train,Facebook,Instagram,YouTube,Snapchat,Twitter,Discord,Reddit,Pinterest,TikTok
0,24,0,3,1,1,2,3,3,2,2,...,1,1,1,1,0,1,0,0,0,0
1,46,0,2,1,1,0,2,1,1,1,...,1,1,0,1,0,0,0,0,0,0
2,30,1,2,1,1,4,2,4,4,2,...,1,1,1,1,0,0,0,0,0,0
3,56,1,2,0,1,1,1,1,1,1,...,1,0,0,1,0,0,0,0,0,0
4,19,0,3,2,1,4,3,3,4,3,...,1,1,1,1,0,0,0,0,0,1


In [133]:
# Separate the training rows (is_train == 1) from the combined frame.
# The is_train flag is only bookkeeping, so it is removed from the features.
train_indices = combined_data['is_train'] == 1
X_train_combined = X_combined.loc[train_indices].drop(columns=['is_train'])
y_train_combined = y_combined.loc[train_indices]

# Class counts of the training target BEFORE any resampling. The message used
# to say "after balancing", which mislabelled the imbalanced counts shown here.
original_class_distribution = y_train_combined.value_counts()
print("Class distribution before balancing:")
print(original_class_distribution)

Class distribution after balancing:
Difficulty_level
1    225
0    104
2     55
Name: count, dtype: int64


In [134]:
from imblearn.over_sampling import SMOTE, SMOTENC

# Oversample the minority classes of the training data.
#
# Plain SMOTE builds synthetic rows by interpolating between neighbours on every
# feature. For label-encoded nominal columns and 0/1 platform flags that yields
# meaningless in-between values (e.g. Occupation = 1.37, Facebook = 0.6).
# SMOTENC interpolates only the continuous columns and assigns each listed
# categorical column the most frequent category among the nearest neighbours.
nominal_columns = [
    'Sex', 'Relationship Status', 'Occupation', 'Social Media User?',
    *social_media_platforms,
]

# Positional indices are used (rather than names) because they are accepted by
# every SMOTENC release; columns absent from the frame are skipped.
categorical_feature_indices = [
    X_train_combined.columns.get_loc(column)
    for column in nominal_columns
    if column in X_train_combined.columns
]

smote = SMOTENC(categorical_features=categorical_feature_indices, random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_combined, y_train_combined)

In [135]:
# Tally the resampled training target to confirm every class now has equal support
balanced_class_distribution = y_train_resampled.value_counts()
print("Class distribution after balancing:", balanced_class_distribution, sep="\n")

Class distribution after balancing:
Difficulty_level
0    225
1    225
2    225
Name: count, dtype: int64
