In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'crimes-in-chicago:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F740%2F1375%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240929%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240929T184413Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Da5b84f0d36ea5161ed791e739e5fc6ec50d3a683d723e577025bb48558a63c094dc5f3134e15daaad5c79771263a27c279089f2c2069c7ce26b1bf377bfa82a364de13823bef06d7f7ff8f13494f38e84bf5781626bc5aaee8c7fda5e6f8ea8179bbcc14265f22bfdaa44acfcfe55e8c9827fdc530c52c280fadc24e816eb21d225e0cfc21032c34e0056fed350b2fd0dc4a11f4242d8724e020518f548bf3269e932684f77e3720d37bc02cec3dad379fe5af9dea21b6d855e66cae2b791def17bc291ce24534936e5a74878620035a5145802d00eba9f8619754c81d5f6b3e19d4f5abb73f02290a239599ea5060d0203f483f6ed2da8b7ce6bb1196c97650'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

# Start from a clean input directory: unmount /kaggle/input if something is
# mounted there (error output is discarded) and delete leftovers of earlier runs.
!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
# (Re)create the input and working directories; exist_ok keeps this re-runnable.
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# Expose both directories as ../input and ../working relative to the current
# directory; a link that already exists is left untouched.
try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<url-encoded URL>" entry of DATA_SOURCE_MAPPING
# into a temporary file and unpack it below KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split once only, so a ':' in the encoded URL part cannot break unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The Content-Length header may be missing; in that case the
            # progress bar stays empty instead of failing on int(None).
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / total_bytes) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by
            # name and would otherwise see a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Use a distinct name so the `tarfile` module is not rebound
                # to the opened archive object.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os # accessing directory structure
# The bare 'seaborn' style sheet was renamed to 'seaborn-v0_8' in matplotlib 3.6
# and the old name no longer resolves on recent releases, so use whichever
# name this matplotlib installation provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
pd.set_option('display.max_columns', None)

In [None]:
# Print the full path of every file found below /kaggle/input
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

In [None]:
import pandas as pd

# CSV extracts (one per year range) that together form the dataset
file_paths = [
    '/kaggle/input/crimes-in-chicago/Chicago_Crimes_2008_to_2011.csv',
    '/kaggle/input/crimes-in-chicago/Chicago_Crimes_2001_to_2004.csv',
    '/kaggle/input/crimes-in-chicago/Chicago_Crimes_2012_to_2017.csv',
    '/kaggle/input/crimes-in-chicago/Chicago_Crimes_2005_to_2007.csv'
]

# Parse each extract (rows pandas cannot parse are skipped) and stack the
# results into one frame with a fresh 0..n-1 index
df_list = []
for csv_path in file_paths:
    df_list.append(pd.read_csv(csv_path, on_bad_lines='skip'))
combined_df = pd.concat(df_list, ignore_index=True)

# (rows, columns) of the stacked frame
print(combined_df.shape)

# Optionally, save the combined DataFrame to a new CSV file
# combined_df.to_csv('combined_chicago_crimes.csv', index=False)


In [None]:
# crimes = pd.read_csv('/content/drive/MyDrive/bda/chicago_crime_dataset_bda.csv')

In [None]:
crimes = combined_df

## <a name="p4">Basic Description</a>

**ID** - Unique identifier for the record.

**Case Number** - The Chicago Police Department RD Number (Records Division Number), which is unique to the incident.

**Date** - Date when the incident occurred. This is sometimes a best estimate.

**Block** - The partially redacted address where the incident occurred, placing it on the same block as the actual address.

**IUCR** - The Illinois Uniform Crime Reporting code. This is directly linked to the Primary Type and Description. See the list of IUCR codes at https://data.cityofchicago.org/d/c7ck-438e.

**Primary Type** - The primary description of the IUCR code.

**Description** - The secondary description of the IUCR code, a subcategory of the primary description.

**Location Description** - Description of the location where the incident occurred.

**Arrest** - Indicates whether an arrest was made.

**Domestic** - Indicates whether the incident was domestic-related as defined by the Illinois Domestic Violence Act.

**Beat** - Indicates the beat where the incident occurred. A beat is the smallest police geographic area – each beat has a dedicated police beat car. Three to five beats make up a police sector, and three sectors make up a police district. The Chicago Police Department has 22 police districts. See the beats at https://data.cityofchicago.org/d/aerh-rz74.

**District** - Indicates the police district where the incident occurred. See the districts at https://data.cityofchicago.org/d/fthy-xz3r.

**Ward** - The ward (City Council district) where the incident occurred. See the wards at https://data.cityofchicago.org/d/sp34-6z76.

**Community Area** - Indicates the community area where the incident occurred. Chicago has 77 community areas. See the community areas at https://data.cityofchicago.org/d/cauq-8yn6.

**FBI Code** - Indicates the crime classification as outlined in the FBI's National Incident-Based Reporting System (NIBRS). See the Chicago Police Department listing of these classifications at http://gis.chicagopolice.org/clearmap_crime_sums/crime_types.html.

**X Coordinate** - The x coordinate of the location where the incident occurred in State Plane Illinois East NAD 1983 projection. This location is shifted from the actual location for partial redaction but falls on the same block.

**Y Coordinate** - The y coordinate of the location where the incident occurred in State Plane Illinois East NAD 1983 projection. This location is shifted from the actual location for partial redaction but falls on the same block.

**Year** - Year the incident occurred.

**Updated On** - Date and time the record was last updated.

**Latitude** - The latitude of the location where the incident occurred. This location is shifted from the actual location for partial redaction but falls on the same block.

**Longitude** - The longitude of the location where the incident occurred. This location is shifted from the actual location for partial redaction but falls on the same block.

**Location** - The location where the incident occurred in a format that allows for creation of maps and other geographic operations on this data portal. This location is shifted from the actual location for partial redaction but falls on the same block.

In [None]:
crimes.shape

In [None]:
crimes.info()

In [None]:
def summary(df):
    """Build a one-row-per-column overview of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    pandas.DataFrame
        Indexed by the columns of ``df`` with: data type, number of missing
        values, number of fully duplicated rows (same value repeated on every
        line), number of unique values, and the min / max / mean / std /
        most frequent value / its frequency as reported by
        ``df.describe(include='all')``.
    """
    summ = pd.DataFrame(df.dtypes, columns=['data type'])
    summ['#missing'] = df.isnull().sum().values
    summ['Duplicate'] = df.duplicated().sum()
    summ['#unique'] = df.nunique().values
    # describe() only emits the statistics that apply to the dtypes present
    # (e.g. no 'top'/'freq' for an all-numeric frame); reindex guarantees every
    # statistic exists, filled with NaN where describe() did not produce it.
    desc = df.describe(include='all').transpose().reindex(
        columns=['min', 'max', 'mean', 'std', 'top', 'freq']
    )
    summ['min'] = desc['min'].values
    summ['max'] = desc['max'].values
    summ['avg'] = desc['mean'].values
    summ['std dev'] = desc['std'].values
    summ['top value'] = desc['top'].values
    summ['Freq'] = desc['freq'].values

    return summ

summary(crimes.drop(columns=["ID"])).style.background_gradient()

In [None]:
# Parse the 'Date' strings (month/day/year, 12-hour clock with AM/PM marker)
# into pandas datetimes
crimes['Date'] = pd.to_datetime(crimes['Date'], format='%m/%d/%Y %I:%M:%S %p')

# Index the frame by timestamp so the time-based operations used further down
# (resample, index.dayofweek, index.month, index.date) work directly
crimes.index = pd.DatetimeIndex(crimes['Date'])


## <a name="p4">Filtering out irrelevant features from the dataset.</a>

In [None]:
# Remove, in place, the columns that are not carried forward
# ('Year' is derived again from 'Date' further down).
crimes.drop(
    columns=[
        'Case Number', 'IUCR', 'Updated On', 'Year', 'FBI Code',
        'Beat', 'Ward', 'Community Area', 'Location',
    ],
    inplace=True,
)


## <a name="p4">Dropping missing/null values as they are accounting for <1% of data.</a>

In [None]:
crimes = crimes.dropna()

In [None]:
crimes.shape

In [None]:
crimes['Primary Type'].value_counts()

In [None]:
crimes

## <a name="p4">Reducing/merging number of crime types from 32 to 20.</a>

In [None]:
# Primary types that are excluded from the analysis
excluded_primary_types = [
    'NON-CRIMINAL (SUBJECT SPECIFIED)',
    'NON-CRIMINAL',
    'NON - CRIMINAL',
    'CONCEALED CARRY LICENSE VIOLATION',
    'DOMESTIC VIOLENCE',
    'PUBLIC INDECENCY',
    'OBSCENITY',
    'RITUALISM',
]

# .copy() yields an independent frame: later in-place edits (the .loc
# assignments in combinePrimaryTypes) then do not operate on a slice of the
# previous frame and do not raise SettingWithCopyWarning.
crimes = crimes[~crimes['Primary Type'].isin(excluded_primary_types)].copy()

# Add this type, it is rare. --Haifeng

In [None]:
def combinePrimaryTypes(targetDf):
    '''
    Merge related PRIMARY CRIME TYPES into broader labels.

    The 'Primary Type' column of ``targetDf`` is rewritten in place according
    to the rule table below, and the same frame is returned.
    '''
    # (labels to fold, label they become) -- applied in this order
    merge_rules = (
        (['OTHER NARCOTIC VIOLATION'], 'NARCOTICS'),
        (['PROSTITUTION', 'CRIM SEXUAL ASSAULT'], 'SEX OFFENSE'),
        (['LIQUOR LAW VIOLATION', 'RITUALISM', 'GAMBLING'], 'OTHER OFFENSE'),
        (['CRIMINAL TRESPASS', 'ROBBERY'], 'ROBBERY'),
        (['INTERFERENCE WITH PUBLIC OFFICER'], 'PUBLIC PEACE VIOLATION'),
        (['INTIMIDATION', 'STALKING'], 'INTIMIDATION or STALKING'),
    )
    for old_labels, merged_label in merge_rules:
        matching_rows = targetDf['Primary Type'].isin(old_labels)
        targetDf.loc[matching_rows, 'Primary Type'] = merged_label

    return targetDf


In [None]:
crimes = combinePrimaryTypes(crimes)

In [None]:
crimes['Primary Type'].value_counts()

## <a name="p4">Truncating data to 1.25%</a>

In [None]:
from sklearn.model_selection import train_test_split

# Fraction of rows kept for the rest of the notebook (1.25%)
SAMPLE_FRACTION = 0.0125

# Keep only the "test" side of a stratified split, so the sample preserves the
# 'Primary Type' class proportions. The discarded majority split is not bound
# to any name and no feature-only copy of the full frame is built, so the
# large frame can be garbage-collected. .copy() detaches the sample so the
# in-place edits in the following cells act on an independent frame.
crimes = train_test_split(
    crimes,
    test_size=SAMPLE_FRACTION,
    stratify=crimes['Primary Type'],
    random_state=42,
)[1].copy()

# Display the sampled DataFrame
print(crimes)


In [None]:
# Everything outside the 20 most frequent values of each column is relabelled 'OTHER'
loc_to_change = crimes['Location Description'].value_counts().iloc[20:].index.tolist()
desc_to_change = crimes['Description'].value_counts().iloc[20:].index.tolist()

rare_location_rows = crimes['Location Description'].isin(loc_to_change)
crimes.loc[rare_location_rows, 'Location Description'] = 'OTHER'

rare_description_rows = crimes['Description'].isin(desc_to_change)
crimes.loc[rare_description_rows, 'Description'] = 'OTHER'

In [None]:
# Store the three label columns as pandas Categoricals (comparable to a 'factor' in R)
for label_column in ('Primary Type', 'Location Description', 'Description'):
    crimes[label_column] = pd.Categorical(crimes[label_column])

## <a name="p4">Exploratory data analysis (EDA)</a>

# The EDA on the Chicago Crime dataset aims to uncover the distribution of crime types, identify hotspots for different crimes, analyze trends over time, assess arrest rates, and map crime density geospatially. This analysis will provide insights into crime patterns, locations, trends, law enforcement efforts, and spatial distribution within Chicago for informed decision-making and strategies.

In [None]:
# Crime types whose share (in percent) is below this value are merged into 'Others'
MIN_SHARE_PCT = 1.5

crime_types = crimes['Primary Type'].value_counts()
# Calculate the total count
total_count = crime_types.sum()

# Calculate the percentage for each crime type
crime_types_pct = crime_types / total_count * 100
# 'Primary Type' is categorical, so the counts carry a CategoricalIndex; use
# plain string labels so that adding the synthetic 'Others' label below does
# not depend on how a CategoricalIndex treats values outside its categories.
crime_types_pct.index = crime_types_pct.index.astype(str)

# Crime types at or above the threshold keep their own slice; .copy() makes
# the selection independent before a new entry is added to it.
is_major = crime_types_pct >= MIN_SHARE_PCT
major_crime_types = crime_types_pct[is_major].copy()

# Group the remaining crime types as 'Others'
others = crime_types_pct[~is_major].sum()
if others > 0:
    major_crime_types['Others'] = others

# Create the pie chart
fig, ax = plt.subplots(figsize=(10, 8))
ax.pie(major_crime_types.values, labels=major_crime_types.index, autopct='%1.1f%%')
ax.axis('equal')  # Ensure the pie chart is circular
ax.set_title('Crime Types')

plt.show()

#We can see that theft occurs most frequently, making up 21% of all crimes. Theft, battery, criminal damage, and narcotics are the most frequently occurring crimes in Chicago; together they make up 65.7% of all crimes.

In [None]:
# Horizontal bars: number of records per primary crime type, smallest first
type_counts = crimes.groupby([crimes['Primary Type']]).size().sort_values(ascending=True)

fig, ax = plt.subplots(figsize=(8, 10))
type_counts.plot(kind='barh', ax=ax)
ax.set_title('Number of crimes by type')
ax.set_ylabel('Crime Type')
ax.set_xlabel('Number of crimes')
plt.show()

In [None]:
# Horizontal bars: number of records per location description, smallest first
location_counts = crimes.groupby([crimes['Location Description']]).size().sort_values(ascending=True)

fig, ax = plt.subplots(figsize=(8, 10))
location_counts.plot(kind='barh', ax=ax)
ax.set_title('Number of crimes by Location')
ax.set_ylabel('Crime Location')
ax.set_xlabel('Number of crimes')
plt.show()

In [None]:
# Tick labels in the order used by DatetimeIndex.dayofweek (0 = Monday ... 6 = Sunday)
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

weekday_counts = crimes.groupby([crimes.index.dayofweek]).size()
weekday_counts.plot(kind='barh')
plt.title('Number of crimes by day of the week')
plt.xlabel('Number of crimes')
plt.ylabel('Days of the week')
plt.yticks(np.arange(7), days)
plt.show()

In [None]:
# Horizontal bars: number of records per calendar month (1-12) of the timestamp index
month_counts = crimes.groupby([crimes.index.month]).size()
month_counts.plot(kind='barh')
plt.title('Number of crimes by month of the year')
plt.xlabel('Number of crimes')
plt.ylabel('Months of the year')
plt.show()

#Crime rates seem to peak in the summer months.

In [None]:
# Number of records per calendar month over the whole period
plt.figure(figsize=(11,5))
crimes.resample('M').size().plot(legend=False)
# Take the year range from the data itself: the loaded extracts are named
# 2001-2017, so a hard-coded "2001 - 2024" would mislabel the chart.
first_year, last_year = crimes.index.min().year, crimes.index.max().year
plt.title(f'Number of crimes per month ({first_year} - {last_year})')
plt.xlabel('Months')
plt.ylabel('Number of crimes')
plt.show()

#This chart shows a clear "periodic" pattern in the crimes over many years.

In [None]:
# Daily counts summed over a trailing 365-day window (the first 364 days of
# the series have no complete window and are therefore not drawn)
plt.figure(figsize=(11,4))
crimes.resample('D').size().rolling(365).sum().plot()
# Label the chart with the year range actually present in the data rather
# than a hard-coded one.
first_year, last_year = crimes.index.min().year, crimes.index.max().year
plt.title(f'Rolling sum of all crimes from {first_year} - {last_year}')
plt.ylabel('Number of crimes')
plt.xlabel('Days')
plt.show()

#We see the line decreasing from 2006 up to some point around 2016, after which it stays at roughly the same number of crimes. This means that 2016 is really no better than 2015, but both years show a much better crime record (in total) than the previous years. There is then a sudden drop in the number of cases around 2020, after which it began to rise again.

In [None]:
# Table of record counts: one row per calendar date of the index, one column
# per 'Primary Type', 0 where a date/type combination has no records.
# NOTE(review): recent pandas releases may warn about NumPy callables such as
# np.size being passed as aggfunc -- confirm against the pandas version in use.
crimes_count_date = crimes.pivot_table('ID', aggfunc=np.size, columns='Primary Type', index=crimes.index.date, fill_value=0)
# The pivot index holds plain date objects; turn it back into a DatetimeIndex
crimes_count_date.index = pd.DatetimeIndex(crimes_count_date.index)
# One subplot per crime type (three per row), each showing a 365-row rolling
# sum with its own x and y axis
plo = crimes_count_date.rolling(365).sum().plot(figsize=(12, 30), subplots=True, layout=(-1, 3), sharex=False, sharey=False)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Extract the year from the 'Date' column
crimes['year'] = pd.to_datetime(crimes['Date']).dt.year

# Group the data by year and Primary Type, and count the occurrences
crime_counts = crimes.groupby(['year', 'Primary Type']).size().reset_index(name='count')

# Pivot the data to create a wide format for plotting
crime_counts = crime_counts.pivot(index='year', columns='Primary Type', values='count').fillna(0)

# One colour per crime type. plt.cm.get_cmap was deprecated in matplotlib 3.7
# and removed in 3.9; pyplot.get_cmap takes the same (name, lut) arguments.
crime_colors = plt.get_cmap('tab20', len(crime_counts.columns))
color_map = {col: crime_colors(i) for i, col in enumerate(crime_counts.columns)}

# Plot the data
fig, ax = plt.subplots(figsize=(12, 8))
crime_counts.plot(ax=ax, kind='line', color=[color_map[col] for col in crime_counts.columns])

# Add legend (placed outside the axes, to the right)
legend_labels = crime_counts.columns.tolist()
ax.legend(legend_labels, bbox_to_anchor=(1.02, 1), loc='upper left', ncol=1, fancybox=True, shadow=True)

# Adjust plot settings
ax.set_xlabel('Year')
ax.set_ylabel('Crime Count')
ax.set_title('The trend of crime types across the years')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

#The graphs above suggested an overall trend toward decreasing crime, but that is not the whole picture. Some crime types, such as homicide and deceptive practice, have actually been increasing all along. Other types, such as theft, robbery and stalking, started to increase slightly before 2016 (which may be the reason behind the trend we saw earlier).

In [None]:
import seaborn as sns

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Restrict to the float64 / int64 columns and compute their pairwise correlations
numeric_crimes = crimes.select_dtypes(include=['float64', 'int64'])
correlation_matrix = numeric_crimes.corr()

# Annotated heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(data=correlation_matrix, annot=True, fmt=".2f", linewidth=0.5, cmap="Blues", ax=ax)

plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Make sure 'Date' holds datetimes, then derive the calendar year from it
crimes['Date'] = pd.to_datetime(crimes['Date'])
crimes['Year'] = crimes['Date'].dt.year

# Keep the years 2006 to 2013, both ends included
filtered_crimes = crimes[crimes['Year'].between(2006, 2013)]

# Per year: number of records (count of 'Primary Type') and number of arrests (sum of 'Arrest')
per_year = filtered_crimes.groupby('Year')[['Primary Type', 'Arrest']]
grouped_crimes = per_year.agg({'Primary Type': 'count', 'Arrest': 'sum'}).reset_index()

bar_color = 'tab:blue'
line_color = 'tab:orange'

# Left axis: total records per year as bars
fig, ax1 = plt.subplots(figsize=(10, 6))
ax1.set_xlabel('Year')
ax1.set_ylabel('Total Crimes', color=bar_color)
ax1.bar(grouped_crimes['Year'], grouped_crimes['Primary Type'], color=bar_color, label='Total Crimes')
ax1.tick_params(axis='y', labelcolor=bar_color)

# Right axis (shares the x axis): arrests per year as a line
ax2 = ax1.twinx()
ax2.set_ylabel('Number of Arrests', color=line_color)
ax2.plot(grouped_crimes['Year'], grouped_crimes['Arrest'], color=line_color, marker='o', label='Number of Arrests')
ax2.tick_params(axis='y', labelcolor=line_color)

# Layout first, then one figure-level legend covering both axes
fig.tight_layout()
fig.legend(loc='upper left')

plt.title('Comparison of Total Crimes and Number of Arrests (2006-2013)')
plt.show()

## Modeling: AdaBoost, logistic regression and random forest

In [None]:
crimes['Primary Type'].unique()

In [None]:
crimes

In [None]:
crimes = crimes.drop(columns = ['Date', 'ID', 'Latitude', 'Longitude', 'year'])

In [None]:
crimes

In [None]:
crimes = crimes.drop(columns = ['Block'])

In [None]:
crimes = pd.get_dummies(crimes, columns=['Description', 'Location Description', 'Arrest', 'Domestic'], drop_first=True)

In [None]:
crimes

In [None]:
crimes = crimes[crimes['Primary Type'] != "HUMAN TRAFFICKING"]

In [None]:
crimes['Primary Type'].unique()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

X = crimes.drop('Primary Type',axis=1)
y = crimes['Primary Type']

# Function to display feature importance
def display_feature_importance(model, top_n=34, percentage=3, plot=True, X_fit=None, y_fit=None):
    """Fit ``model`` and select features by relative importance.

    Parameters
    ----------
    model : estimator
        Object with a ``fit(X, y)`` method that exposes
        ``feature_importances_`` once fitted. It is fitted by this call.
    top_n : int
        Number of top-ranked features shown in the bar chart.
    percentage : float
        A feature is selected when its importance is at least ``percentage``
        percent of the largest importance (0 selects every feature).
    plot : bool
        When true, draw the bar chart and print the selected features.
    X_fit, y_fit : optional
        Feature frame and target to fit on. When omitted, the notebook-level
        ``X`` and ``y`` are used. Pass a training split here to keep the
        ranking from seeing held-out rows.

    Returns
    -------
    list of str
        Selected feature names in decreasing order of importance, followed by
        the target column name 'Primary Type'.
    """
    # Fall back to the notebook-level data only when no explicit data is given
    features = X_fit if X_fit is not None else X
    target = y_fit if y_fit is not None else y

    # Fit the model
    model.fit(features, target)

    # Get feature importance
    feature_importance = model.feature_importances_
    feature_names = features.columns

    # Create a DataFrame for better visualization
    feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})

    # Sort features by importance
    feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

    # Calculate threshold based on percentage of the top feature importance
    threshold = percentage / 100 * feature_importance_df.iloc[0]['Importance']

    # Select features that meet the threshold
    selected_features = feature_importance_df[feature_importance_df['Importance'] >= threshold]['Feature'].tolist()

    if plot:
        # Set seaborn color palette to "viridis"
        sns.set(style="whitegrid", palette="viridis")

        # Display or plot the top features
        plt.figure(figsize=(10, 6))
        sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(top_n))
        plt.title('Feature Importance for {}'.format(type(model).__name__))
        plt.show()

        print("Selected Features at threshold {}%; {}".format(percentage,selected_features))

    # Append the target column name so the list can be used to slice a frame
    # that still contains the target
    selected_features.append('Primary Type')

    return selected_features

In [None]:
# from sklearn.preprocessing import label_binarize

# # Binarize y_test to get a better handle on classes
# classes = np.unique(y_train)
# y_test_binarized = label_binarize(y_test, classes=classes)

# print("Unique classes in training:", classes)
# print("Predicted probability shape:", adaboost_pred_proba.shape)
# print("Binarized y_test shape:", y_test_binarized.shape)


In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Stratified split of the notebook-level X / y.
# NOTE(review): test_size=0.7 trains on 30% of the rows and evaluates on 70%;
# the commented-out tuning cells in this notebook use test_size=0.3 -- confirm
# which proportion is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, random_state=42, stratify = y)

# AUC score of every evaluated trial, and the trial percentage it belongs to.
# Both lists grow together so they always have the same length when plotted,
# even if a trial is skipped.
auc_scores_adaboost = []
evaluated_percentages_adaboost = []

# List to store selected features for AdaBoost and trial percentage
selected_features_adaboost = []

# List of trial percentages
trial_percentages_adaboost = [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 22, 26, 30]

# Loop over each trial percentage
for percentage_adaboost in trial_percentages_adaboost:
    # Get selected features for AdaBoost.
    # NOTE(review): display_feature_importance fits on the notebook-level X
    # and y, i.e. on all rows including those in X_test, so the feature
    # ranking has seen the evaluation data.
    adaboost_selected_features = display_feature_importance(AdaBoostClassifier(), percentage=percentage_adaboost)

    # Append selected features to the list
    selected_features_adaboost.append(adaboost_selected_features)

    # Keep only names that are real feature columns (drops the appended target name)
    adaboost_selected_features = [feature for feature in adaboost_selected_features if feature in X_train.columns]

    # Check if any features are selected
    if len(adaboost_selected_features) == 0:
        continue

    # Initialize and fit AdaBoost model on training data
    adaboost_model = AdaBoostClassifier()
    adaboost_model.fit(X_train[adaboost_selected_features], y_train)

    # Predict probabilities on the test set
    adaboost_pred_proba = adaboost_model.predict_proba(X_test[adaboost_selected_features])

    print(len(adaboost_pred_proba), len(y_test))

    # Macro-averaged one-vs-rest AUC over all classes
    auc_adaboost = roc_auc_score(y_test, adaboost_pred_proba, average='macro', multi_class = 'ovr')
    auc_scores_adaboost.append(auc_adaboost)
    evaluated_percentages_adaboost.append(percentage_adaboost)

# Plotting for AdaBoost
fig, ax = plt.subplots(figsize=(12, 8))

# Plotting line for AdaBoost (only the trials that were actually evaluated)
plt.plot(evaluated_percentages_adaboost, auc_scores_adaboost, label='AdaBoost', marker='o', color='orange')

plt.xlabel('Trial Percentages')
plt.ylabel('AUC Score')
plt.title('AdaBoost Model Performance for Different Feature Selection Percentages')
plt.legend()
plt.show()

In [None]:
adaboost_model = AdaBoostClassifier()

# Display feature importance and plot
selected_features_adaboost = display_feature_importance(adaboost_model, top_n=30, percentage=0, plot = True)

In [None]:
pip install optuna

In [None]:
pip install cmaes

In [None]:
# # ADABOOST HYPERPARAMETER
# import optuna
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import AdaBoostClassifier
# from sklearn.metrics import accuracy_score
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# import pandas as pd

# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

# # # Initialize the label encoder
# label_encoder = LabelEncoder()

# # Fit and transform the target variable
# y_train_encoded = label_encoder.fit_transform(y_train)
# y_val_encoded = label_encoder.transform(y_val)

# # Define the objective function to optimize
# def objective(trial):
#     # Define hyperparameters to search
#     adaboost_params = {
#         'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
#         # Additional hyperparameters can be added based on your requirements
#     }

#     # Initialize DecisionTreeClassifier with parameters suggested by Optuna
#     tree_params = {
#         'splitter': trial.suggest_categorical('splitter', ['best', 'random']),
#         'max_depth': trial.suggest_int('max_depth', 1, 10),
#     }

#     base_estimator = DecisionTreeClassifier(**tree_params)

#     # Initialize AdaBoost classifier with the base estimator
#     model = AdaBoostClassifier(base_estimator=base_estimator, **adaboost_params, random_state=42)

#     # Fit the model
#     model.fit(X_train, y_train_encoded)

#     # Predict on the validation set
#     y_pred_encoded = model.predict(X_val)

#     # Decode the predictions back to original labels
#     y_pred = label_encoder.inverse_transform(y_pred_encoded)

#     # Calculate accuracy
#     accuracy = accuracy_score(y_val, y_pred)

#     return accuracy

# # Define study
# study = optuna.create_study(direction='maximize', sampler=optuna.samplers.CmaEsSampler())

# # Optimize hyperparameters
# study.optimize(objective, n_trials=20, show_progress_bar = True)

# best_params = study.best_params

# print(best_params)

In [None]:
# ADABOOST
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

# adaboost_params = {'n_estimators': 367, 'learning_rate': 0.2687183224582479} #v1
# best_base_estimator_params = {'splitter': 'random', 'max_depth': 2} #v1

# Hyperparameters of the boosting ensemble and of its decision-tree weak learner
adaboost_params = {'n_estimators': 910, 'learning_rate': 0.8135287583324838}
best_base_estimator_params = {'splitter': 'best', 'max_depth': 10}
base_estimator = DecisionTreeClassifier(**best_base_estimator_params)
# The weak learner is passed positionally: scikit-learn renamed the keyword
# from `base_estimator` to `estimator` (1.2) and later removed the old name,
# whereas the first positional parameter is the weak learner under both names.
adaboost = AdaBoostClassifier(base_estimator, **adaboost_params, random_state=42)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the training labels as integers.
# They get their own name: rebinding `y` here would replace the full target
# (which pairs with X) by train-only labels and break any later re-run of a
# cell that uses X together with y.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# Create and fit the AdaBoostClassifier. The weak learner is passed
# positionally because scikit-learn renamed the keyword from `base_estimator`
# to `estimator` (1.2) and later removed the old name.
model = AdaBoostClassifier(base_estimator, **adaboost_params, random_state=42)
model.fit(X_train, y_train_encoded)

# Make predictions on the test set
# y_pred_encoded = model.predict(test[[feature for feature in selected_features_adaboost if feature != 'Primary Type']])

In [None]:
from sklearn.metrics import classification_report
# Integer-encoded class predictions of the fitted AdaBoost `model` for the test features
y_pred_encoded = model.predict(X_test)

# Decode predictions back to original labels
y_pred = label_encoder.inverse_transform(y_pred_encoded)

# Print classification report (per-class precision / recall / F1 against y_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# import optuna
# import xgboost as xgb
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import AdaBoostClassifier
# from sklearn.metrics import accuracy_score
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# import pandas as pd

# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

# # # Initialize the label encoder
# label_encoder = LabelEncoder()

# # Fit and transform the target variable
# y_train_encoded = label_encoder.fit_transform(y_train)
# y_val_encoded = label_encoder.transform(y_val)

# # Define the objective function to optimize
# def objective(trial):
#     # Define hyperparameters to search
#     params = {
#         'max_depth': trial.suggest_int('max_depth', 3, 10),
#         'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
#         'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
#         'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
#         'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-8, 1.0),
#         'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
#         'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
#         'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
#         'reg_lambda': trial.suggest_uniform('reg_lambda', 0.0, 1.0)
#     }

#     # Initialize XGBoost classifier
#     model = xgb.XGBClassifier(**params, eval_metric='mlogloss')

#     # Fit the model
#     model.fit(X_train, y_train_encoded)

#     # Predict on the validation set
#     y_pred_encoded = model.predict(X_val)

#     # Decode the predictions back to original labels
#     y_pred = label_encoder.inverse_transform(y_pred_encoded)

#     # Calculate accuracy
#     accuracy = accuracy_score(y_val, y_pred)

#     return accuracy

# # Define study
# study = optuna.create_study(direction='maximize', sampler=optuna.samplers.CmaEsSampler())

# # Optimize hyperparameters
# study.optimize(objective, n_trials=100, show_progress_bar = True)

# best_params = study.best_params
# best_params['eval_metric'] = 'mlogloss'

# print(best_params)

In [None]:
# best_params = {'max_depth': 4, 'learning_rate': 0.0779066477789779, 'n_estimators': 476, 'gamma': 0.0005911097817824864, 'min_child_weight': 0.00036671924431171436, 'subsample': 0.7209603514105916, 'colsample_bytree': 0.5365746437670638, 'reg_alpha': 8.135325864945739e-08, 'reg_lambda': 0.6636375034403076, 'eval_metric': 'mlogloss'}

In [None]:
# import xgboost as xgb
# y = label_encoder.fit_transform(y_train)
# final_model = xgb.XGBClassifier(**best_params)
# final_model.fit(X_train, y)

# y_pred_encoded = final_model.predict(X_test)

In [None]:
# NOTE(review): the XGBoost cells above are commented out, so in a
# top-to-bottom run `y_pred_encoded` still holds the predictions of the
# AdaBoost `model`; this cell therefore repeats the AdaBoost report.
y_pred = label_encoder.inverse_transform(y_pred_encoded)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report as cls_report

# Initialize the Logistic Regression classifier
logistic_regression_classifier = LogisticRegression(max_iter=1000)  # You can adjust max_iter as needed

# Train the Logistic Regression classifier on the training split
logistic_regression_classifier.fit(X_train, y_train)

# Predict the primary crime type for the test set using Logistic Regression
y_pred_logistic_regression = logistic_regression_classifier.predict(X_test)

# Evaluate the performance of the Logistic Regression classifier
accuracy_logistic_regression = accuracy_score(y_test, y_pred_logistic_regression)
classification_report_logistic_regression = cls_report(y_test, y_pred_logistic_regression)

# Print the evaluation results for Logistic Regression
print("Logistic Regression Model Evaluation:")
print("Accuracy:", accuracy_logistic_regression)
print("\nClassification Report:")
print(classification_report_logistic_regression)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Random forest with a fixed seed, trained on the training split
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

# Class predictions for the held-out rows
rf_preds = rf_classifier.predict(X_test)

# Per-class precision / recall / F1 against the true test labels
rf_report = classification_report(y_test, rf_preds)
print("Classification Report - Random Forest:")
print(rf_report)
