## Camilla's Exploratory Graphs and XGBoost Model

In [30]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
mergdf = pd.read_csv("merged3.csv")

In [5]:
mergdf.head(2)

Unnamed: 0,date,count,name,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,...,description,icon,stations,full_moon,Holiday,Day of Week,CHIC917URN,day_of_week,year,month
0,2010-01-01,13.0,"Chicago,United States",-8.3,-14.5,-11.7,-15.0,-22.4,-19.0,-16.1,...,Partly cloudy throughout the day.,partly-cloudy-day,"72534014819,KORD,KMDW,72530094846,74466504838,...",,New Year's Day,Friday,12.2,4,2010,1
1,2010-01-02,4.0,"Chicago,United States",-11.4,-16.4,-14.1,-18.4,-25.3,-22.6,-19.6,...,Partly cloudy throughout the day.,partly-cloudy-day,"72534014819,KORD,KMDW,72530094846,74466504838,...",,,,12.2,5,2010,1


In [10]:
X = mergdf[['tempmax',
            'tempmin',
            'temp',
            'feelslikemax',
            'feelslikemin',
            'feelslike',
            'humidity',
            'precip',
            'precipprob',
            'precipcover',
            'snow',
            'snowdepth',
            'windgust',
            'windspeed',
            'winddir', 
            'sealevelpressure',
            'cloudcover',
            'visibility',
            'solarradiation',
            'solarenergy',
            'uvindex',
            'severerisk', 
            'moonphase',
            'day_of_week',
            'year',
            'month',
            'preciptype',
            'sunrise',
            'sunset',
            'conditions',
            'full_moon', 
            'Holiday',
            'CHIC917URN']]

y = list(mergdf['count'])

In [37]:
#num_features = mergdf[['tempmax', 'tempmin', 'temp', 'feelslikemax', 'feelslikemin', 'feelslike', 'humidity',
#                        'precip', 'precipprob', 'precipcover', 'snow', 'snowdepth', 'windgust', 'windspeed', 'winddir', 
#                        'sealevelpressure', 'cloudcover', 'visibility', 'solarradiation', 'solarenergy', 'uvindex', 'severerisk', 
#                        'moonphase', 'day_of_week', 'year', 'month', 'CHIC917URN']]

#cat_features = mergdf[['preciptype', 'conditions', 'full_moon', 'Holiday']]

cat_features = X.select_dtypes(include="object").columns
num_features = X.select_dtypes(exclude="object").columns

numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer()),
           ('poly', PolynomialFeatures(degree=1, include_bias=False)),
           ("scaler", StandardScaler())])

categorical_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="most_frequent")),
           ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
           ("selector", SelectPercentile(f_regression, percentile=50))])

In [38]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_features),
        ("cat", categorical_transformer, cat_features)])

pipe = Pipeline(
    steps=[("preprocessor", preprocessor), 
           ("model", XGBRegressor())])

In [39]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



xgb_pipe = pipe.fit(X_train, y_train)


In [40]:
preds = xgb_pipe.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, preds)
print("Mean Squared Error:", mse)

mae = mean_absolute_error(y_test, preds)
print("Mean Absolute Error:", mae)


Mean Squared Error: 4.5164189565022355
Mean Absolute Error: 1.684635028746969


In [29]:
np.mean(mergdf['count'])

3.9454861782331334

In [41]:
import matplotlib.pyplot as plt

# Get feature importance scores from the trained XGBoost model
feature_importance = xgb_pipe.named_steps['model'].feature_importances_

# Create a DataFrame with feature names and their importance scores
feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importance})

# Sort features by importance
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Plot feature importance
plt.figure(figsize=(10, 6))
plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importance')
plt.show()

ValueError: All arrays must be of the same length

In [45]:
# Get feature importance scores from the trained XGBoost model
feature_importance = xgb_pipe.named_steps['model'].feature_importances_

# Create a DataFrame with feature names and their importance scores
feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importance})

# Sort features by importance
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

feature_importance_df

ValueError: All arrays must be of the same length

In [46]:
from sklearn.inspection import permutation_importance

# Compute permutation feature importance
perm_importance = permutation_importance(xgb_pipe, X_test, y_test, n_repeats=10, random_state=42)

# Get feature names
feature_names = X_test.columns

# Calculate mean importance and sort by descending order
mean_importance = perm_importance['importances_mean']
sorted_indices = mean_importance.argsort()[::-1]

# Print feature importance
for idx in sorted_indices:
    print(f"{feature_names[idx]}: {mean_importance[idx]}")


year: 0.03809408807207769
feelslikemin: 0.03050054540186331
tempmin: 0.02912704351002332
month: 0.02727346423202409
solarradiation: 0.02632996890147137
solarenergy: 0.019375020583889534
precip: 0.011534872943480346
tempmax: 0.010709759130245878
CHIC917URN: 0.00974769134110769
day_of_week: 0.0075520290450932556
windgust: 0.006898744965198433
sealevelpressure: 0.003638282005031146
precipcover: 0.003350985000806983
conditions: 0.003231655202908479
Holiday: 0.002453605348519372
snowdepth: 0.001987280095064048
winddir: 0.0017322281957616648
severerisk: 0.001487875128237992
humidity: 0.0006711123646561679
precipprob: 0.0
sunrise: 0.0
sunset: 0.0
full_moon: 0.0
preciptype: 0.0
windspeed: -0.00015814006860195295
uvindex: -0.000252198396502179
visibility: -0.00039631303069030953
temp: -0.0005885853457081369
cloudcover: -0.001206906465935209
feelslike: -0.001516175281833909
snow: -0.0017567517255823173
moonphase: -0.005235031463223583
feelslikemax: -0.0068264603149829515


A higher number for permutation importance indicates that the feature is more important for the model's predictive performance. Therefore, you would typically want to keep features with higher permutation importance values, as they contribute more to the model's accuracy or predictive power.