<a href="https://colab.research.google.com/github/benmanjackson/CS290/blob/main/Friday_Oct4thHomework.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Let's practice loading and exploring some data! :)

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [16]:
# Pull the weatherAUS CSV directly from the course repository on GitHub.
DATA_URL = "https://github.com/benmanjackson/CS290/raw/main/weatherAUS.csv"
df = pd.read_csv(DATA_URL)

In [17]:
# Discard every row that has at least one missing value (dropna's default,
# how='any'), then show the column names as the cell output so the feature
# lists defined further down can be checked against them. The column
# expression has to be the last statement of the cell to be displayed.
# NOTE(review): with all NaN rows removed here, the SimpleImputer steps in the
# preprocessing pipelines have nothing left to fill — consider dropping only
# the rows that are missing 'RainTomorrow' and letting the imputers do the rest.
df = df.dropna()
df.columns

In [18]:
# Separate the label column ('RainTomorrow') from the remaining columns.
y = df['RainTomorrow']
X = df.drop(columns=['RainTomorrow'])

In [19]:
# Columns routed through the numeric preprocessing pipeline.
# The order is kept exactly as-is because it fixes the feature order.
num_attributes = [
    "MinTemp", "MaxTemp",
    "Rainfall", "Evaporation", "Sunshine",
    "WindGustSpeed", "WindSpeed9am", "WindSpeed3pm",
    "Humidity9am", "Humidity3pm",
    "Pressure9am", "Pressure3pm",
    "Cloud9am", "Cloud3pm",
    "Temp9am", "Temp3pm",
]

# Columns routed through the categorical preprocessing pipeline.
cat_attributes = [
    "WindGustDir",
    "WindDir9am",
    "WindDir3pm",
]

In [20]:
# Preprocessing pipelines
# Numeric columns: fill missing values with the column mean, then standardize.
num_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
num_pipeline = Pipeline(num_steps)

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fit are ignored at transform time.
cat_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
]
cat_pipeline = Pipeline(cat_steps)

In [21]:
# Column transformer
# Send each column group through its matching preprocessing pipeline. Columns
# named in neither list are dropped (ColumnTransformer's default remainder).
col_transform = ColumnTransformer([
    ('num', num_pipeline, num_attributes),
    ('cat', cat_pipeline, cat_attributes),
])

In [22]:
# Full pipeline
# Preprocessing followed by a decision tree. The tree is seeded (same seed as
# the train/test split and the random forest) because its split search is
# randomized; without a seed the reported accuracy and the grid-search
# results change from run to run.
pipeline = Pipeline(steps=[
    ('pre', col_transform),
    ('clf', DecisionTreeClassifier(random_state=42))
])

Prep data for training/testing

In [23]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [24]:
# Fit the preprocessing steps and the decision tree on the training split.
pipeline.fit(X=X_train, y=y_train)

In [25]:
# Prediction: label the held-out test rows with the fitted pipeline.
y_pred = pipeline.predict(X=X_test)

In [26]:
# Evaluation: fraction of test labels the baseline tree predicted correctly.
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'DecisionTreeClassifier Accuracy: {accuracy}')

DecisionTreeClassifier Accuracy: 0.7927153491669621


# GridSearchCV

In [27]:
from sklearn.model_selection import GridSearchCV

In [28]:
#Parameters:
param_grid = {
    'clf__max_depth': [3, 5, 7, 10],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4]
}

In [29]:
# 5-fold cross-validated search over param_grid, ranking candidates by accuracy.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)

In [30]:
# Run the search on the training split only; the test split stays untouched.
grid_search.fit(X=X_train, y=y_train)

  _data = np.array(data, dtype=dtype, copy=copy,


In [31]:
# Report the winning hyper-parameters and their mean cross-validation accuracy.
best_params, best_score = grid_search.best_params_, grid_search.best_score_
print(
    f'Best Parameters: {best_params}',
    f'Best Cross-Validation Accuracy: {best_score}',
    sep='\n',
)

Best Parameters: {'clf__max_depth': 7, 'clf__min_samples_leaf': 4, 'clf__min_samples_split': 5}
Best Cross-Validation Accuracy: 0.8454891908727123


In [32]:
# Evaluation for GridSearchCV: score the best pipeline found by the search
# on the held-out test split.
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X=X_test)
best_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_best)
print(f'Best Model Test Accuracy: {best_accuracy}')

Best Model Test Accuracy: 0.8451790145338532


# RandomForestClassifier

In [33]:
from sklearn.ensemble import RandomForestClassifier


In [35]:
#RandomForest Pipeline:
rf_pipeline = Pipeline(steps=[
    ('pre', col_transform),
    ('clf', RandomForestClassifier(random_state=42))
])

In [36]:
# Fit the preprocessing steps and the random forest on the training split.
rf_pipeline.fit(X=X_train, y=y_train)

In [37]:
# Label the held-out test rows with the fitted random-forest pipeline.
y_pred_rf = rf_pipeline.predict(X=X_test)

In [38]:
# Test-set accuracy of the random forest.
accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
print(f'RandomForestClassifier Accuracy: {accuracy_rf}')

RandomForestClassifier Accuracy: 0.8608649415101028


**Performance Evaluation:**

In [39]:
# Side-by-side test accuracies of the three models trained in this notebook.
print(
    f'DecisionTreeClassifier Accuracy: {accuracy}',
    f'Best DecisionTreeClassifier Accuracy (with GridSearchCV): {best_accuracy}',
    f'RandomForestClassifier Accuracy: {accuracy_rf}',
    sep='\n',
)

DecisionTreeClassifier Accuracy: 0.7927153491669621
Best DecisionTreeClassifier Accuracy (with GridSearchCV): 0.8451790145338532
RandomForestClassifier Accuracy: 0.8608649415101028
