In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px


In [2]:
def load_csv_to_dataframe(path="/Users/reembeniluz/DataScienceProj/projects_table_data_final.csv"):
    """Read the projects table from a CSV file into a DataFrame.

    Args:
        path: Location of the CSV file. Defaults to the path this notebook
            was originally written against, so existing zero-argument calls
            keep working; pass another path to run the notebook elsewhere.

    Returns:
        The DataFrame produced by ``pd.read_csv(path)``.
    """
    return pd.read_csv(path)

In [3]:
def clean_location_col(df):
    """Collapse the 'Location' column to a country-level label, in place.

    A location string is split on ', '. If any of the resulting parts is a
    two-letter US state code the value becomes 'USA'; otherwise the last part
    (stripped of surrounding whitespace) is kept. Values that are not strings
    (for example missing values) are left unchanged instead of raising.

    Args:
        df: DataFrame with a 'Location' column. The column is overwritten.

    Returns:
        The same DataFrame object, for convenience.
    """
    # Only the abbreviations are matched. Full state names are deliberately
    # not matched.
    # NOTE(review): 'DC' is not in this set, so '..., DC' stays 'DC' - confirm
    # whether that is intended.
    us_state_codes = frozenset({
        "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA",
        "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
        "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
        "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
        "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY",
    })

    def to_country(location):
        # Missing values are not strings; pass them through untouched.
        if not isinstance(location, str):
            return location
        parts = location.split(', ')
        if not us_state_codes.isdisjoint(parts):
            return 'USA'
        return parts[-1].strip()

    df["Location"] = df["Location"].apply(to_country)
    return df

In [4]:
def classify_success_col(df):
    """Add a 0/1 'Success' column to `df`, in place.

    A row gets 1 when its 'Percent Funded' is at least 100 and 0 otherwise
    (a missing 'Percent Funded' compares as False and therefore yields 0).
    Nothing is returned.
    """
    reached_goal = df['Percent Funded'] >= 100
    df['Success'] = reached_goal.astype('int64')

In [5]:
def boolean_to_numbers(df):
    """Recode the 'Is Project We Love' column from booleans to 0/1, in place.

    False becomes 0 and True becomes 1 through a dictionary lookup.
    Nothing is returned.
    """
    column = 'Is Project We Love'
    flag_to_number = {False: 0, True: 1}
    df[column] = df[column].map(flag_to_number)

In [6]:
def targetCol_success_compare(target_col, top_n=20):
    """Show a bar chart of the mean of 'Success' for each value of `target_col`.

    Reads the notebook-level `df`. Groups are ordered by descending success
    rate; a group is dropped only when its rate is not below 1 and it has
    fewer than 5 rows; the first `top_n` remaining groups are plotted.

    Args:
        target_col: Name of the column in `df` to group by.
        top_n: Maximum number of groups to show.
    """
    palette = ["#1984c5", "#22a7f0", "#63bff0", "#a7d5ed", "#e2e2e2", "#e1a692", "#de6e56", "#e14b31", "#c23728"]

    per_group = df.groupby(target_col)['Success']
    rate = per_group.mean().sort_values(ascending=False)
    # Re-order the row counts so they line up with the sorted rates.
    n_rows = per_group.count().loc[rate.index]

    keep = (rate < 1) | (n_rows >= 5)
    shown = rate[keep][:top_n]

    fig = px.bar(
        shown,
        x=shown.index,
        y=shown.values,
        color=shown.index,
        color_discrete_sequence=palette,
    )

    layout = {
        'title': 'Success Rate by ' + target_col,
        'xaxis': {'title': target_col},
        'yaxis': {'title': 'Success Rate'},
        'xaxis_tickangle': -45,
        'showlegend': False,
    }
    fig.update_layout(**layout)

    fig.show()

In [73]:
def create_heatMap():
    """Show a heatmap of pairwise correlations between the project features.

    Reads the notebook-level `df`. Besides the listed columns, a 'Country'
    column is included that holds, for every row, the number of rows in `df`
    sharing that row's 'Location' value.
    """
    palette = ["#1984c5", "#22a7f0", "#63bff0", "#a7d5ed", "#e2e2e2", "#e1a692", "#de6e56", "#e14b31", "#c23728"]
    feature_cols = ['Description Length', 'Goal Amount', 'Duration', 'Risk Desc Count', 'Image Count', 'Video Count', 'Is Project We Love', 'Success']

    # Frequency-encode Location so it can take part in the correlation.
    rows_per_location = df['Location'].value_counts()
    features = df[feature_cols].assign(Country=df['Location'].map(rows_per_location))

    corr_matrix = features.corr()

    fig = px.imshow(corr_matrix, color_continuous_scale=palette)
    fig.update_layout(
        title='Correlation Heatmap',
        xaxis={'title': 'Columns'},
        yaxis={'title': 'Columns'},
        showlegend=False,
    )

    fig.show()

In [74]:
def create_target_box_graph(target):
    """Show box plots of the `target` column, one box per value of 'Success'.

    Reads the notebook-level `df`.

    Args:
        target: Name of the column in `df` plotted on the y axis.
    """
    colors = ["#63bff0", "#e14b31"]
    fig = px.box(df, x='Success', y=target, color='Success', color_discrete_sequence=colors)

    fig.update_layout(
        title=target + ' Distribution by Project Success',
        xaxis=dict(title='Project Success'),
        # Label the axis with the column name itself: appending 'Count'
        # produced titles such as 'Image CountCount' and
        # 'Description LengthCount'.
        yaxis=dict(title=target),
        showlegend=False
    )

    fig.show()

In [75]:
def create_count_by_category_graph():
    """Show a bar chart with the number of rows in each 'Parent Category'.

    Reads the notebook-level `df`.
    """
    palette = ["#22a7f0", "#63bff0", "#e2e2e2", "#e1a692", "#de6e56", "#e14b31"]

    # Two-column table: the category and how many rows carry it.
    per_category = (
        df['Parent Category']
        .value_counts()
        .rename_axis('Parent Category')
        .reset_index(name='Count')
    )

    fig = px.bar(
        per_category,
        x='Parent Category',
        y='Count',
        color='Parent Category',
        color_discrete_sequence=palette,
    )

    fig.update_layout(
        title='Count of Projects by Parent Category',
        xaxis={'title': 'Parent Category'},
        yaxis={'title': 'Count'},
        showlegend=False,
    )

    fig.show()

In [114]:
def create_category_success_distribuition():
    """Show stacked bars with the percentage split of 'Success' per 'Parent Category'.

    Reads the notebook-level `df`. Row counts for every
    (Parent Category, Success) pair are divided by the category total and
    multiplied by 100.
    """
    palette = ["#de6e56", "#63bff0"]

    pair_counts = df.groupby(['Parent Category', 'Success']).size().unstack()
    category_totals = pair_counts.sum(axis=1)
    percentages = pair_counts.div(category_totals, axis=0) * 100

    fig = px.bar(percentages, barmode='stack', color_discrete_sequence=palette)

    layout = {
        'title': 'Distribution of Project Success by Parent Category',
        'xaxis': {'title': 'Parent Category'},
        'yaxis': {'title': 'Percentage'},
        'legend': {'title': 'Success', 'x': 1.0, 'y': 1.0},
        'showlegend': True,
        'xaxis_tickangle': -45,
    }
    fig.update_layout(**layout)

    fig.show()

In [115]:
def create_category_love_distribution():
    """Show stacked bars with the percentage split of 'Is Project We Love' per 'Parent Category'.

    Reads the notebook-level `df`. Row counts for every
    (Parent Category, Is Project We Love) pair are divided by the category
    total and multiplied by 100.
    """
    love_by_category = df.groupby(['Parent Category', 'Is Project We Love']).size().unstack()
    love_by_category = love_by_category.div(love_by_category.sum(axis=1), axis=0) * 100

    colors = ["#de6e56", "#63bff0"]
    fig = px.bar(love_by_category, barmode='stack', color_discrete_sequence=colors)

    # The frame is indexed by 'Parent Category' and has one column per
    # 'Is Project We Love' value, so the labels name those two variables
    # (the previous labels were copied from the success chart).
    fig.update_layout(
        title='Distribution of Is Project We Love by Parent Category',
        xaxis=dict(title='Parent Category'),
        yaxis=dict(title='Percentage'),
        legend=dict(title='Is Project We Love', x=1.0, y=1.0),
        showlegend=True,
        xaxis_tickangle=-45
    )

    fig.show()

In [116]:
def create_low_goal_percent_funded_graph():
    """Show the success rate per 'Parent Category' for below-median goals.

    Reads the notebook-level `df`. Only rows whose 'Goal Amount' is strictly
    below the overall median are used, so rows equal to the median are left
    out. Each bar is annotated with the median 'Goal Amount' of that category
    within the subset.
    """
    goal = df['Goal Amount']
    below_median = df[goal < goal.median()]

    # One pass over the groups yields both the rate and the median goal.
    summary = (
        below_median
        .groupby('Parent Category')
        .agg(**{
            'Success': ('Success', 'mean'),
            'Median Goal Amount': ('Goal Amount', 'median'),
        })
        .reset_index()
    )

    fig = px.bar(
        summary,
        x='Parent Category',
        y='Success',
        title="Success Rate by Parent Category (Low Goal Amount)",
        labels={'Parent Category': 'Parent Category', 'Success': 'Success Rate'},
        color_discrete_sequence=['#63bff0'],
    )

    bars = zip(summary['Parent Category'], summary['Success'], summary['Median Goal Amount'])
    for category, rate, median_goal in bars:
        fig.add_annotation(
            x=category,
            y=rate,
            text=f"{median_goal}",
            showarrow=False,
            font={'color': 'black'},
            yshift=10,
        )

    fig.update_layout(showlegend=False)
    fig.show()


In [130]:
def create_high_goal_percent_funded_graph():
    """Show the success rate per 'Parent Category' for above-median goals.

    Reads the notebook-level `df`. Only rows whose 'Goal Amount' is strictly
    above the overall median are used, so rows equal to the median are left
    out. Each bar is annotated with the median 'Goal Amount' of that category
    within the subset.
    """
    goal = df['Goal Amount']
    above_median = df[goal > goal.median()]

    # One pass over the groups yields both the rate and the median goal.
    summary = (
        above_median
        .groupby('Parent Category')
        .agg(**{
            'Success': ('Success', 'mean'),
            'Median Goal Amount': ('Goal Amount', 'median'),
        })
        .reset_index()
    )

    fig = px.bar(
        summary,
        x='Parent Category',
        y='Success',
        title="Success Rate by Parent Category (High Goal Amount)",
        labels={'Parent Category': 'Parent Category', 'Success': 'Success Rate'},
        color_discrete_sequence=['#63bff0'],
    )

    bars = zip(summary['Parent Category'], summary['Success'], summary['Median Goal Amount'])
    for category, rate, median_goal in bars:
        fig.add_annotation(
            x=category,
            y=rate,
            text=f"{median_goal}",
            showarrow=False,
            font={'color': 'black'},
            yshift=10,
        )

    fig.update_layout(showlegend=False)
    fig.show()

In [131]:
def print_graphs():
    """Render every chart of the notebook, one after another.

    Order: category counts, the two per-category distributions, success rate
    by Location / Currency / Parent Category, the correlation heatmap, the
    four box plots, and finally the low-goal and high-goal success charts.
    """
    create_count_by_category_graph()
    create_category_success_distribuition()
    create_category_love_distribution()

    for grouping_column in ("Location", "Currency", "Parent Category"):
        targetCol_success_compare(grouping_column)

    create_heatMap()

    for feature in ('Image Count', 'Video Count', 'Description Length', 'Risk Desc Count'):
        create_target_box_graph(feature)

    create_low_goal_percent_funded_graph()
    create_high_goal_percent_funded_graph()



In [132]:
def _drop_iqr_outliers(frame, column, whisker=1.5):
    """Return the rows of `frame` whose `column` lies within the IQR fences.

    The fences are Q1 - whisker * IQR and Q3 + whisker * IQR, both inclusive.
    Quartiles are computed with ``Series.quantile``, which ignores missing
    values; rows whose `column` is missing fail the range check and are
    dropped.
    """
    q1, q3 = frame[column].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower_bound = q1 - whisker * iqr
    upper_bound = q3 + whisker * iqr
    return frame[frame[column].between(lower_bound, upper_bound)]


df = load_csv_to_dataframe()

# Remove outliers column by column. 'Goal Amount' is filtered first, so the
# 'Duration' quartiles are computed on the already-filtered rows.
for outlier_column in ('Goal Amount', 'Duration'):
    df = _drop_iqr_outliers(df, outlier_column)

# The helpers below add or overwrite columns in place. Take an independent
# copy so they do not write into a filtered slice of the loaded frame
# (which triggers pandas' SettingWithCopyWarning).
df = df.copy()

clean_location_col(df)
classify_success_col(df)
boolean_to_numbers(df)
print_graphs()

