In [1]:
import pandas as pd
import numpy as np

In [41]:
# Sample dataset
data = pd.DataFrame({
    'ID': [1, 2, 3, 4, 5],
    'Category': ['A', 'B', 'A', 'C', 'B'],
    'Numerical_Value': [10, np.nan, 30, 40, 50],
    'Text_Value': ['Good', 'Bad', 'Excellent', 'Good', 'Excellent']
})

In [42]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

In [43]:
# 1. Data Cleaning
# Handle missing values
imputer = SimpleImputer(strategy='mean')
data[['Numerical_Value']] = imputer.fit_transform(data[['Numerical_Value']])
data

Unnamed: 0,ID,Category,Numerical_Value,Text_Value
0,1,A,10.0,Good
1,2,B,32.5,Bad
2,3,A,30.0,Excellent
3,4,C,40.0,Good
4,5,B,50.0,Excellent


In [36]:
# encoder = OneHotEncoder()
# new = encoder.fit(data[['Category']])
# new.categories_

[array(['A', 'B', 'C'], dtype=object)]

In [None]:
# encoder = OneHotEncoder()

In [27]:
# # 2. Data Transformation
# # Encode categorical variables
# encoder = OneHotEncoder()
# encoded_categorical = pd.DataFrame(encoder.fit_transform(data[['Category']]).toarray(),
#                                    columns=encoder.get_feature_names(['Category']))
# data = pd.concat([data, encoded_categorical], axis=1)
# data = data.drop(['Category'], axis=1)

AttributeError: 'OneHotEncoder' object has no attribute 'get_feature_names'

In [44]:
# 3. Feature Engineering
# Extract features from text
data['Text_Length'] = data['Text_Value'].apply(len)

# Dimensionality reduction
pca = PCA(n_components=1)
data['New Feature'] = pca.fit_transform(data[['Numerical_Value', 'Text_Length']])
data

# PCA_Component - > reduce 2 feature as 1 singlw feature

Unnamed: 0,ID,Category,Numerical_Value,Text_Value,Text_Length,New Feature
0,1,A,10.0,Good,4,-22.571069
1,2,B,32.5,Bad,3,-0.247013
2,3,A,30.0,Excellent,9,-2.207952
3,4,C,40.0,Good,4,7.311964
4,5,B,50.0,Excellent,9,17.71407


In [47]:
# 4. Data Splitting
X = data.drop(['ID', 'Text_Value'], axis=1)
y = data['ID']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train

Unnamed: 0,Category,Numerical_Value,Text_Length,New Feature
4,B,50.0,9,17.71407
2,A,30.0,9,-2.207952
0,A,10.0,4,-22.571069
3,C,40.0,4,7.311964


In [48]:
# 5. Data Normalization
# Scale numerical features
scaler = StandardScaler()
X_train[['Numerical_Value', 'Text_Length']] = scaler.fit_transform(X_train[['Numerical_Value', 'Text_Length']])
X_train

Unnamed: 0,Category,Numerical_Value,Text_Length,New Feature
4,B,1.183216,1.0,17.71407
2,A,-0.169031,1.0,-2.207952
0,A,-1.521278,-1.0,-22.571069
3,C,0.507093,-1.0,7.311964


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

In [49]:
# Load the transportation dataset
transport_data = pd.read_csv('synthetic_data.csv')
transport_data.head()

Unnamed: 0,Date,Time,Stop/Station,Passenger_Count,Vehicle_ID,Latitude,Longitude,Temperature (°C),Precipitation (mm),Humidity (%),Age_Group,Gender,Feedback
0,2023-11-16,12:03,Johor Bahru,46,TRAIN82,3.906935,106.068464,11,3,63,18-24,Male,Driver was friendly
1,2023-07-14,05:07,Cameron Highlands,17,TRAIN65,4.227106,118.407191,3,3,74,25-40,Female,Seats were uncomfortable
2,2023-09-22,14:11,Ipoh,91,TRAIN38,6.819556,101.272984,27,1,81,40-60,Male,Delay in departure
3,2022-07-12,09:11,Penang,41,BUS245,3.627521,106.22699,1,7,98,25-40,Female,Driver was friendly
4,2023-12-09,16:59,Kuching,53,BUS958,1.418952,117.050925,15,9,71,40-60,Male,Service was excellent


In [50]:
# 1. Data Cleaning
# Handle missing values
imputer = SimpleImputer(strategy='mean')
transport_data[['Passenger_Count', 'Temperature (°C)', 'Precipitation (mm)', 'Humidity (%)']] = imputer.fit_transform(transport_data[['Passenger_Count', 'Temperature (°C)', 'Precipitation (mm)', 'Humidity (%)']])

In [51]:
transport_data = transport_data.rename(columns={'Age Group': 'Age_Group'})

In [None]:
# 2. Data Transformation
# Encode categorical variables
encoder = OneHotEncoder()
encoded_categorical = pd.DataFrame(encoder.fit_transform(transport_data[['Age_Group', 'Gender']]).toarray(), columns=encoder.get_feature_names(['Age_Group', 'Gender']))
transport_data = pd.concat([transport_data, encoded_categorical], axis=1)
transport_data = transport_data.drop(['Age_Group', 'Gender'], axis=1)



In [None]:
# 3. Feature Engineering
# Create new features
transport_data['Temperature_Humidity_Ratio'] = transport_data['Temperature (°C)'] / transport_data['Humidity (%)']

In [52]:
# Dimensionality reduction
pca = PCA(n_components=2)
transport_data_pca = pca.fit_transform(transport_data[['Passenger_Count', 'Temperature (°C)', 'Precipitation (mm)', 'Humidity (%)']])
# transport_data['PCA_Component1'] = transport_data_pca[:, 0]
# transport_data['PCA_Component2'] = transport_data_pca[:, 1]

In [55]:
transport_data_pca[:,1]

array([ 3.08252394e+00, -8.77527779e+00,  6.64978761e+00, -2.32124999e+01,
        2.36127286e+00, -2.68974353e+01,  2.59967726e+01, -1.53441281e+01,
        2.73825328e+00,  6.82058453e+00,  1.51410517e+01, -1.62343620e+01,
       -5.52695035e+00, -1.01498800e+00,  1.33023590e+01, -2.85968051e+00,
        2.80798660e+01,  1.02302966e+00,  2.00055989e+01,  4.20918551e+00,
        1.40409408e+01, -4.38819308e+00,  9.88330516e+00, -1.28283760e+01,
       -9.82558821e+00,  7.05676668e+00, -2.46876235e+01,  7.71665469e+00,
       -7.70143705e+00,  1.01837676e+01, -2.13631813e+01, -1.30727726e+01,
       -1.33673075e+01,  1.32846958e+01, -1.67987665e+01,  2.24735299e+00,
       -2.56105380e+01,  1.28375648e+01,  1.28725223e+01, -1.82415083e+01,
       -5.85715290e-01,  2.34580700e+01,  1.05073938e+01,  1.94589622e+01,
       -3.81282920e+00, -1.72655350e+01, -1.19899866e+01,  1.46336288e+01,
        2.99736191e+00,  2.33250167e+01, -2.65942877e+01, -2.30334287e+01,
       -1.13932100e+01,  

In [None]:
# 4. Data Splitting
X = transport_data.drop(['Passenger_Count'], axis=1)
y = transport_data['Passenger_Count']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# 5. Data Normalization
# Scale numerical features
scaler = StandardScaler()
X_train[['Temperature (°C)', 'Precipitation (mm)', 'Humidity (%)', 'Temperature_Humidity_Ratio', 'PCA_Component1', 'PCA_Component2']] = scaler.fit_transform(X_train[['Temperature (°C)', 'Precipitation (mm)', 'Humidity (%)', 'Temperature_Humidity_Ratio', 'PCA_Component1', 'PCA_Component2']])
X_test[['Temperature (°C)', 'Precipitation (mm)', 'Humidity (%)', 'Temperature_Humidity_Ratio', 'PCA_Component1', 'PCA_Component2']] = scaler.transform(X_test[['Temperature (°C)', 'Precipitation (mm)', 'Humidity (%)', 'Temperature_Humidity_Ratio', 'PCA_Component1', 'PCA_Component2']])

In [56]:
X_test

Unnamed: 0,Category,Numerical_Value,Text_Length,New Feature
1,B,32.5,3,-0.247013
