DATA CLEANING:

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Colour palettes for the "before" and "after" panels, written as 0-1 RGB
# tuples (the 0-255 value of each entry is given in its trailing comment).
custom_colors_before = [
    (89/255, 179/255, 200/255),  # RGB(89, 179, 200)
    (34/255, 80/255, 122/255),  # RGB(34, 80, 122)
    (43/255, 98/255, 138/255),  # RGB(43, 98, 138)
    (53/255, 119/255, 156/255),  # RGB(53, 119, 156)
    (75/255, 158/255, 184/255),  # RGB(75, 158, 184)
    (16/255, 19/255, 45/255),    # RGB(16, 19, 45)
]

custom_colors_after = [
    (10/255, 165/255, 184/255),  # Hero Image: RGB(10, 165, 184)
    (7/255, 60/255, 108/255),    # RGB(7, 60, 108)
    (17/255, 177/255, 189/255),  # RGB(17, 177, 189)
    (16/255, 78/255, 123/255),   # RGB(16, 78, 123)
    (7/255, 51/255, 102/255),    # RGB(7, 51, 102)
    (17/255, 93/255, 136/255),   # RGB(17, 93, 136)
]


def _palette_for(n_series, base_colors):
    """Return exactly ``n_series`` colours, cycling through ``base_colors``.

    A palette list whose length differs from the number of plotted series
    triggers a warning (or an error, depending on the seaborn version), so
    the palette is sized to the data instead of being passed as-is.
    """
    return [base_colors[i % len(base_colors)] for i in range(n_series)]


# Load your dataset
original_data = pd.read_csv("/content/sample_data/Selected_variables.csv")

# Fill missing values with the column mean.
# numeric_only=True keeps DataFrame.mean() from raising on text columns
# (pandas >= 2.0 no longer skips them silently); non-numeric columns are
# simply left unfilled.
cleaned_data = original_data.fillna(original_data.mean(numeric_only=True))

# Only two line graphs are drawn, so use a 1x2 grid rather than a larger grid
# that would leave empty panels in the figure.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

panels = [
    (original_data, custom_colors_before, 'Line Graph Before Cleaning'),
    (cleaned_data, custom_colors_after, 'Line Graph After Cleaning'),
]

for ax, (frame, base_colors, title) in zip(axes, panels):
    # Plot the numeric columns explicitly so the number of lines (and hence
    # the number of colours needed) is known up front.
    numeric_frame = frame.select_dtypes(include='number')
    sns.lineplot(
        data=numeric_frame,
        palette=_palette_for(numeric_frame.shape[1], base_colors),
        ax=ax,
    )
    ax.set_title(title)
    # Remove axis labels
    ax.set(xlabel='', ylabel='')
    # Add legend to line graphs
    ax.legend(loc='best')

plt.tight_layout()
plt.show()

# Save the cleaned dataset to a new file (e.g., 'cleaned_data.csv')
# NOTE(review): this writes to the current working directory, while a later
# cell reads "/content/sample_data/cleaned_data.csv" - confirm the file is
# moved there, or align the two paths.
cleaned_data.to_csv('cleaned_data.csv', index=False)


Note: The dependent variable ‘PUB_TRANS_SPENDING’ is recoded to a binary value (0 or 1):
If the value is 0, it is recoded as 0.
If the value is anything other than 0 (in practice, greater than 0), it is recoded as 1.


In [None]:
import pandas as pd

# Load the cleaned dataset into a DataFrame
# NOTE(review): this path is under /content/sample_data/, whereas the cleaning
# cell saves 'cleaned_data.csv' to the working directory - confirm the file is
# placed here before this cell runs.
df = pd.read_csv("/content/sample_data/cleaned_data.csv")

# Recode only the target column into a binary flag: 0 stays 0, every other
# value becomes 1. The vectorised comparison gives the same result as a
# row-wise `0 if x == 0 else 1` (a missing value is also != 0, so it maps to 1).
target_column_name = 'PUB_TRANS_SPENDING'
df[target_column_name] = (df[target_column_name] != 0).astype(int)

# Save the updated DataFrame to a new CSV file
df.to_csv('updated_file.csv', index=False)


Cleaning STATE and URBAN

In [None]:
import pandas as pd

# Load the dataset that has the STATE and URBAN columns added
raw_df = pd.read_csv("/content/sample_data/ADDED_STATE_URBAN.csv")

# Remove rows that contain blank cells.
# By default read_csv already parses completely empty fields as NaN, so the
# cells that still need converting are the ones holding only whitespace
# (e.g. ' '). The anchored regex matches empty and whitespace-only strings;
# dropna() then removes every row with at least one missing value.
df = raw_df.replace(r'^\s*$', pd.NA, regex=True).dropna()

# Show a summary rather than dumping the whole dataset
print(f"Kept {len(df)} of {len(raw_df)} rows after dropping incomplete rows")
print(df.head())

# Save the updated dataset to a new CSV file
# NOTE(review): written to the working directory, but the feature-engineering
# cell reads "/content/sample_data/updated_data_STATE_URBAN.csv" - confirm the
# file is moved there, or align the two paths.
df.to_csv('updated_data_STATE_URBAN.csv', index=False)



FEATURE ENGINEERING:

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset produced by the STATE/URBAN cleaning step
data = pd.read_csv("/content/sample_data/updated_data_STATE_URBAN.csv")

# Define custom colors (0-1 RGB tuples; 0-255 values in the trailing comments)
custom_colors = [
    (89/255, 179/255, 200/255),  # RGB(89, 179, 200)
    (34/255, 80/255, 122/255),  # RGB(34, 80, 122)
    (43/255, 98/255, 138/255),  # RGB(43, 98, 138)
    (53/255, 119/255, 156/255),  # RGB(53, 119, 156)
    (75/255, 158/255, 184/255),  # RGB(75, 158, 184)
    (16/255, 19/255, 45/255),    # RGB(16, 19, 45)
]

# Feature Engineering 1: Binning (Car Ownership Category)
# Bins are right-inclusive: (-1, 0] -> '0 Cars', (0, 1] -> '1 Car',
# (1, 2] -> '2 Cars', (2, inf] -> '3+ Cars'.
data['Car_Ownership_Category'] = pd.cut(
    data['NUM_AUTO'],
    bins=[-1, 0, 1, 2, float('inf')],
    labels=['0 Cars', '1 Car', '2 Cars', '3+ Cars'],
)

# Feature Engineering 2: Grouping (Mean INC_RANK by State)
state_grouping = data.groupby('STATE')['INC_RANK'].mean().sort_values(ascending=False)

# Feature Engineering 3: Binary Encoding (Urban_Encoded)
# URBAN values 1 and 2 become 1 and 0; any other value maps to NaN.
data['Urban_Encoded'] = data['URBAN'].map({1: 1, 2: 0})

# Visualize the feature engineering results on explicit axes
fig, (ax_cars, ax_state, ax_urban) = plt.subplots(1, 3, figsize=(15, 5))

# Plot 1: Bar Chart (Car Ownership Category)
# sort_index() orders the bars by category (0, 1, 2, 3+) instead of by frequency.
car_counts = data['Car_Ownership_Category'].value_counts().sort_index()
car_counts.plot(kind='bar', ax=ax_cars, title='Car Ownership Category', color=custom_colors[0])
ax_cars.set_ylabel('Count')

# Plot 2: Line Graph (Mean INC_RANK by State)
state_grouping.plot(kind='line', marker='o', ax=ax_state, title='Mean INC_RANK by State', color=custom_colors[1])
ax_state.set_ylabel('Mean INC_RANK')

# Plot 3: Bar Chart (Urban Encoding)
# Guard on the counts rather than on the column: the column always exists
# here, but if no URBAN value was 1 or 2 it is entirely NaN and there is
# nothing to draw.
urban_counts = data['Urban_Encoded'].value_counts().sort_index()
if not urban_counts.empty:
    urban_counts.plot(kind='bar', ax=ax_urban, title='Urban Encoding', color=custom_colors[2])
    ax_urban.set_ylabel('Count')
else:
    ax_urban.set_title('Urban Encoding')
    ax_urban.text(
        0.5, 0.5, 'No URBAN values of 1 or 2 to plot',
        ha='center', va='center', transform=ax_urban.transAxes,
    )

plt.tight_layout()
plt.show()

# Save the updated DataFrame to a CSV file
# NOTE(review): written to the working directory, but the next cell reads
# "/content/sample_data/updated_features_selected.csv" - confirm the file is
# moved there, or align the two paths.
data.to_csv('updated_features_selected.csv', index=False)


Note:
In the car ownership category, the labels ‘0 Cars’, ‘1 Car’, ‘2 Cars’ and ‘3+ Cars’ are encoded as 0, 1, 2 and 3 respectively.


In [None]:
import pandas as pd

# Load the dataset that carries the engineered features
data = pd.read_csv("/content/sample_data/updated_features_selected.csv")

# Ordinal codes for Car_Ownership_Category.
# The keys must match the labels produced by the binning step exactly; the top
# bin is labelled '3+ Cars' (a '3 Cars' key would match nothing and turn every
# such row into NaN).
car_ownership_mapping = {'0 Cars': 0, '1 Car': 1, '2 Cars': 2, '3+ Cars': 3}

# Report any label the mapping does not cover, since Series.map() would
# silently convert it to NaN.
unmapped_labels = set(data['Car_Ownership_Category'].dropna()) - set(car_ownership_mapping)
if unmapped_labels:
    print(f"Warning: unmapped Car_Ownership_Category labels become NaN: {sorted(unmapped_labels)}")

# Replace the labels in the Car_Ownership_Category column with their codes
data['Car_Ownership_Category'] = data['Car_Ownership_Category'].map(car_ownership_mapping)

# Save the modified data to a new CSV file
data.to_csv('your_modifiedfeature_data.csv', index=False)


MODELING:

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt

# Load the modelling dataset
# NOTE(review): the preceding cell saves 'your_modifiedfeature_data.csv'; confirm
# that this file is the intended, up-to-date input.
data = pd.read_csv("/content/sample_data/updated_feature_dataset_____.csv")

# Target variable: the binary public-transport spending flag
target_column_name = 'PUB_TRANS_SPENDING'

# Define features (X) and target variable (y)
X = data.drop(columns=[target_column_name])
y = data[target_column_name]

# Split the data into training and testing sets BEFORE imputing, so that the
# imputation statistics are learned from the training rows only and no
# information from the test set leaks into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Handle missing values (replace NaN with the training-set column mean)
imputer = SimpleImputer(strategy='mean')
X_train = pd.DataFrame(
    imputer.fit_transform(X_train_raw), columns=X.columns, index=X_train_raw.index
)
X_test = pd.DataFrame(
    imputer.transform(X_test_raw), columns=X.columns, index=X_test_raw.index
)

# Full imputed feature table in the original row order, kept under its
# previous name in case other cells refer to it.
X_imputed = pd.concat([X_train, X_test]).loc[X.index]

# Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)
y_pred_rf = rf_classifier.predict(X_test)

# Evaluate Random Forest Classifier
accuracy_rf = accuracy_score(y_test, y_pred_rf)
# zero_division=0 yields the same score as the default when no positive class
# is predicted, without emitting a warning.
precision_rf = precision_score(y_test, y_pred_rf, zero_division=0)
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)

print("Random Forest Classifier Results:")
print(f"Accuracy: {accuracy_rf}")
print(f"Precision: {precision_rf}")
print("Confusion Matrix:")
print(conf_matrix_rf)

# Generate Classification Report
classification_rep_rf = classification_report(y_test, y_pred_rf)
print("\nClassification Report:")
print(classification_rep_rf)

# Feature Importances (sorted from most to least important)
feature_importances = pd.DataFrame({'Feature': X.columns, 'Importance': rf_classifier.feature_importances_})
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)
print("\nFeature Importances:")
print(feature_importances)

# Define custom colors for the scatter plot
custom_colors = [
    (89/255, 179/255, 200/255),  # RGB(89, 179, 200)
    (34/255, 80/255, 122/255),  # RGB(34, 80, 122)
    (43/255, 98/255, 138/255),  # RGB(43, 98, 138)
    (53/255, 119/255, 156/255),  # RGB(53, 119, 156)
    (75/255, 158/255, 184/255),  # RGB(75, 158, 184)
    (16/255, 19/255, 45/255),    # RGB(16, 19, 45)
]

# Create a scatter plot for overall accuracy.
# The single point sits at x=0 (a placeholder position for the model) and its
# height is the accuracy, so the y-axis is the one labelled "Accuracy".
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter([0], [accuracy_rf], color=custom_colors[0], marker='o', s=100)
ax.set_title('Overall Accuracy')
ax.set_xticks([0])
ax.set_xticklabels(['Random Forest'])
ax.set_xlabel('Model')
ax.set_ylabel('Accuracy')

ax.grid(True, linestyle='--', alpha=0.6)
ax.set_ylim(0, 1)

plt.show()
