In [None]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Notebook-wide configuration: show every DataFrame column, use the ggplot
# matplotlib style, and silence all Python warnings.
pd.set_option('display.max_columns', None)
plt.style.use('ggplot')
warnings.filterwarnings(action="ignore")

In [None]:
# Load the scraped company table from the CSV file into `data`.
data = pd.read_csv('webscrape.csv')
# Preview the first rows to check what was read.
data.head()

In [None]:
# Remove the 'Rank' column. Rebinding `data` with errors='ignore' (instead of
# an in-place drop) keeps this cell safe to re-execute: a second run no longer
# raises KeyError because the column is already gone.
data = data.drop(columns='Rank', errors='ignore')

In [None]:
# Summarize column dtypes and non-null counts.
data.info()

In [None]:
# Convert 'Revenue (USD millions)' from comma-grouped text to int.
# The dtype guard makes the cell re-runnable: once the column is numeric the
# `.str` accessor would raise AttributeError on a second execution.
if not pd.api.types.is_numeric_dtype(data['Revenue (USD millions)']):
    data['Revenue (USD millions)'] = (
        data['Revenue (USD millions)']
        .str.replace(',', '', regex=False)  # literal comma, not a regex
        .astype('int')
    )

In [None]:
# Convert 'Employees' from comma-grouped text to int.
# The dtype guard makes the cell re-runnable: once the column is numeric the
# `.str` accessor would raise AttributeError on a second execution.
if not pd.api.types.is_numeric_dtype(data['Employees']):
    data['Employees'] = (
        data['Employees']
        .str.replace(',', '', regex=False)  # literal comma, not a regex
        .astype('int')
    )

In [None]:
# Convert 'Revenue growth' from percent text (trailing '%') to a float fraction.
# The dtype guard makes the cell re-runnable: without it a second execution
# fails on `.str` because the column is already float.
if not pd.api.types.is_numeric_dtype(data['Revenue growth']):
    data['Revenue growth'] = data['Revenue growth'].str.strip('%').astype('float') / 100

In [None]:
# Summarize column dtypes and non-null counts again to check the current types.
data.info()

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Count rows that repeat an earlier row in every column.
data.duplicated().sum()

In [None]:
# Count rows whose 'Name' repeats an earlier row's 'Name'.
data['Name'].duplicated().sum()

In [None]:
# Preview the first rows of the current table.
data.head()

In [None]:
def city(data, feature):
    """Return the city part of every headquarters string.

    The city is the text before the first comma of each value.

    A fresh list is built on every call. The previous version appended to a
    module-level list, so results from earlier calls leaked into later ones
    and the returned list grew beyond the number of rows.

    Args:
        data: DataFrame holding the column to parse.
        feature: Name of the column with comma-separated location strings.

    Returns:
        A list with one city string per row, in row order.
    """
    return [location.split(',')[0] for location in data[feature]]

In [None]:
def state(data, feature):
    """Return the state part of every headquarters string.

    Rules, applied per value:
      * values containing 'D.C' map to the fixed label 'Washington, D.C.';
      * values containing 'Cook County' have an extra county field, so the
        state is the third comma-separated part;
      * otherwise the state is the second comma-separated part.

    The parts are returned exactly as split, so they keep the whitespace that
    followed the comma.

    A fresh list is built on every call. The previous version appended to a
    module-level list shared with other code, so the result could contain
    entries from earlier calls and no longer match the number of rows.

    Args:
        data: DataFrame holding the column to parse.
        feature: Name of the column with comma-separated location strings.

    Returns:
        A list with one state string per row, in row order.
    """
    states = []
    for location in data[feature]:
        parts = location.split(',')
        if 'D.C' in location:
            states.append('Washington, D.C.')
        elif 'Cook County' in location:
            states.append(parts[2])
        else:
            states.append(parts[1])
    return states

In [None]:
# Store the city parsed from 'Headquarters' as a new 'City' column.
data['City'] = city(data, 'Headquarters')

In [None]:
# Store the state parsed from 'Headquarters' as a new 'State' column.
data['State'] = state(data, 'Headquarters')

In [None]:
# Remove the 'Headquarters' column. Rebinding `data` with errors='ignore'
# (instead of an in-place drop) means re-executing this cell no longer raises
# KeyError once the column is gone.
data = data.drop(columns='Headquarters', errors='ignore')

In [None]:
# Trim the whitespace left around each state name by the comma split.
# `str.strip()` only touches the ends; the previous `str.replace(' ', '')`
# also removed interior spaces and turned e.g. "New York" into "NewYork".
data['State'] = data['State'].str.strip()

In [None]:
#General Data Visualization 

In [None]:
#Industry in Data Distribution
# Count companies per industry with explicit column names. Relying on the
# default names from `value_counts().reset_index()` is fragile: they are
# ('index', 'Industry') on pandas < 2.0 and ('Industry', 'count') on >= 2.0.
industry_counts = (
    data['Industry']
    .value_counts()
    .rename_axis('Industry')
    .reset_index(name='Count')
)

fig = px.pie(
    industry_counts,
    names='Industry',
    values='Count',
    title='Industry in Data Distribution'
).update_traces(textposition='inside', textinfo='percent+label')\
.update_layout(
            title={
            'x':0.40,
            'y':0.92,
            'xanchor':'center'},
            width=1000, height = 650)

fig.show()

In [None]:
#Top 100 companies State Distribution
# Count companies per state with explicit column names. Relying on the
# default names from `value_counts().reset_index()` is fragile: they are
# ('index', 'State') on pandas < 2.0 and ('State', 'count') on >= 2.0.
state_counts = (
    data['State']
    .value_counts()
    .rename_axis('State')
    .reset_index(name='Count')
)

fig = px.pie(
    state_counts,
    names='State',
    values='Count',
    title='Top 100 Companies State Distribution'
).update_traces(textposition='inside', textinfo='percent+label')\
.update_layout(
            title={
            'x':0.46,
            'y':0.92,
            'xanchor':'center'},
            width=1000, height = 650)

fig.show()

In [None]:
from dash import Dash, html, dcc
from jupyter_dash import JupyterDash
from dash.dependencies import Output, Input
from dash.exceptions import PreventUpdate

In [None]:
# Descriptive statistics of the table's columns.
data.describe()

In [None]:
#Histograms
# Draw one histogram with a KDE overlay for every column whose dtype is not
# object, each in its own figure titled with the column name.

non_object_columns = [name for name in data.columns if data[name].dtype != 'O']

for name in non_object_columns:
    sns.histplot(data[name], kde=True)
    plt.title(name)
    plt.show()

In [None]:
#Top 10 companies with most Revenue (USD millions)/Revenue growth/Employees
app = JupyterDash(__name__)

# Layout: a dropdown listing every non-object column of `data` and one graph.
app.layout = html.Div([
    html.H3('Column Selection:'),
    dcc.Dropdown(id='dropdown',
                options=[{'label': x.title(), 'value': x} for x in data.columns if data[x].dtype != 'O'],
                value='Revenue (USD millions)'),
    dcc.Graph(id='visual')
])

@app.callback(Output('visual', 'figure'), Input('dropdown', 'value'))
def plot(variable):
    """Build a bar chart of the 10 companies with the highest `variable`.

    Args:
        variable: Column name selected in the dropdown, or None when the
            dropdown has been cleared.

    Returns:
        The plotly figure shown in the 'visual' graph.

    Raises:
        PreventUpdate: When no column is selected, so the current figure is
            kept instead of failing on `None` in `sort_values` / `.title()`.
    """
    if variable is None:
        raise PreventUpdate

    df = data.sort_values(variable, ascending=False)[:10]

    fig = px.bar(
        df,
        x=variable,
        y='Name',
        color='Name',
        title = f"Top 10 Companies with Highest {variable.title()}",
        labels={'Name':'Company'},
        text_auto=True
    ).update_traces(showlegend=False)\
     .update_layout(
                title={
                'x':0.5,
                'y':0.88,
                'xanchor':'center'},
                height=500)

    return fig

if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=8074)

In [None]:
#Top 10 Industry with avg/sum with Revenue (USD millions)/Revenue growth/Employees

app = JupyterDash(__name__)

# Layout: a dropdown listing every non-object column of `data` and two graphs
# (average per industry, total per industry).
app.layout = html.Div([
    html.H3('Column Selection:'),
    dcc.Dropdown(id='dropdown',
                options=[{'label': x.title(), 'value': x} for x in data.columns if data[x].dtype != 'O'],
                value='Revenue (USD millions)'),
    dcc.Graph('visual1'),
    dcc.Graph('visual2')
])

def _industry_bar(frame, variable, title):
    """Bar chart of `variable` per industry with a centered title and no legend."""
    return px.bar(
        frame,
        x=variable,
        y='Industry',
        color='Industry',
        title=title,
        text_auto=True
    ).update_traces(showlegend=False)\
     .update_layout(
                title={
                'x':0.5,
                'y':0.88,
                'xanchor':'center'},
                height=500)

@app.callback(Output('visual1', 'figure'),
             Output('visual2', 'figure'),
             Input('dropdown', 'value'))
def plot(variable):
    """Build the top-10 industry charts for the selected column.

    Args:
        variable: Column name selected in the dropdown, or None when the
            dropdown has been cleared.

    Returns:
        A pair of figures: industries ranked by the mean of `variable`, and
        industries ranked by the sum of `variable`.

    Raises:
        PreventUpdate: When no column is selected, so the current figures are
            kept instead of failing on `None`.
    """
    if variable is None:
        raise PreventUpdate

    by_industry = data.groupby('Industry', as_index=False)
    # numeric_only=True: `data` also holds text columns, which make mean()
    # raise on pandas >= 2.0 and are not needed for these charts.
    df1 = by_industry.mean(numeric_only=True).sort_values(variable, ascending=False)[:10]
    df2 = by_industry.sum(numeric_only=True).sort_values(variable, ascending=False)[:10]

    fig1 = _industry_bar(df1, variable, f"Top 10 Industries with Highest AVG {variable.title()}")
    fig2 = _industry_bar(df2, variable, f"Top 10 Industries with Highest Total {variable.title()}")

    return fig1, fig2

if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=8075)
    

In [None]:
#Top 10 companies on each state based Revenue (USD millions) / Employee

app = JupyterDash(__name__)

# Layout: a dropdown with every distinct state and two graphs
# (ranking by revenue, ranking by employees).
app.layout = html.Div([
    html.H3('State Selection:'),
    dcc.Dropdown(id='dropdown',
                options=data.State.unique(),
                value='California'),
    dcc.Graph(id='visual1'),
    dcc.Graph(id='visual2')
])

@app.callback(Output('visual1', 'figure'),
              Output('visual2', 'figure'),
              Input('dropdown', 'value'))
def plot(state):
    """Build the per-state top-10 charts for the selected state.

    Args:
        state: State selected in the dropdown, or None when the dropdown has
            been cleared.

    Returns:
        A pair of figures: companies in `state` ranked by revenue, and
        companies in `state` ranked by number of employees. Both are colored
        by 'Revenue growth'.

    Raises:
        PreventUpdate: When no state is selected, so the current figures are
            kept instead of rendering empty charts titled "... in None".
    """
    if state is None:
        raise PreventUpdate

    # Boolean mask instead of an f-string query: filter once, and the state
    # value never has to be embedded in an expression string.
    in_state = data[data['State'] == state]
    df1 = in_state.sort_values('Revenue (USD millions)', ascending=False)[:10]
    df2 = in_state.sort_values('Employees', ascending=False)[:10]

    fig1 = px.bar(
        df1,
        x='Revenue (USD millions)',
        y='Name',
        title=f"Top 10 Companies with Highest Revenue in {state}",
        labels={'Name':'Company'},
        color='Revenue growth',
        text_auto=True
    ).update_traces(showlegend=False)\
     .update_layout(
                title={
                'x':0.5,
                'y':0.88,
                'xanchor':'center'},
                height=500)

    fig2 = px.bar(
        df2,
        x='Employees',
        y='Name',
        title=f"Top 10 Companies with Most Employee in {state}",
        labels={'Name':'Company', 'Employees':'Employees Cnt'},
        color='Revenue growth',
        text_auto=True
    ).update_traces(showlegend=False)\
     .update_layout(
                title={
                'x':0.5,
                'y':0.88,
                'xanchor':'center'},
                height=500)

    return fig1, fig2

if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=8076)

In [None]:
#Industry Distribution/Revenue Total in each state among Top 100 Companies
# Number of companies per (State, Industry). `size()` with an explicit result
# name replaces `value_counts()` + rename: the value_counts result column is
# named 'Industry' on pandas < 2.0 but 'count' on >= 2.0, so the rename to
# 'Count' silently did nothing on newer versions.
df = data.groupby(['State', 'Industry']).size().reset_index(name='Count')

# Total revenue per (State, Industry), joined with the company counts.
dff = pd.merge(
    data.groupby(['State', 'Industry'], as_index=False)['Revenue (USD millions)'].sum(),
    df,
    on=['State', 'Industry']
)


app = JupyterDash(__name__)

# Layout: a dropdown with every distinct state and two graphs
# (industry share pie, industry revenue bars).
app.layout = html.Div([
    html.H3('State Selection'),
    dcc.Dropdown(id='dropdown',
                options = df['State'].unique(),
                value='California'),
    dcc.Graph('visual1'),
    dcc.Graph('visual2')
])

@app.callback(Output('visual1', 'figure'),
              Output('visual2', 'figure'),
              Input('dropdown', 'value')
              )
def plot(state):
    """Build the industry breakdown charts for the selected state.

    Args:
        state: State selected in the dropdown, or None when the dropdown has
            been cleared.

    Returns:
        A pair of figures: a pie of company counts per industry in `state`,
        and a bar chart of total revenue per industry in `state`.

    Raises:
        PreventUpdate: When no state is selected, so the current figures are
            kept instead of rendering empty charts titled "... in None".
    """
    if state is None:
        raise PreventUpdate

    dfm = dff[dff['State'] == state].sort_values('Revenue (USD millions)', ascending=False)

    fig2 = px.bar(
        dfm,
        y='Industry',
        x='Revenue (USD millions)',
        title=f"Industry Total Revnue in {state}",
        color='Industry',
        text_auto=True
    ).update_traces(showlegend=False)\
     .update_layout(
                title={
                'x':0.5,
                'y':0.88,
                'xanchor':'center'},
                height=500)

    fig1 = px.pie(
        dfm,
        names = 'Industry',
        values = 'Count',
        title = f'Industory Distribution in {state} among Top 100 Companies'
    ).update_traces(textposition='inside', textinfo='percent+label')\
     .update_layout(
                title={
                'x':0.40,
                'y':0.92,
                'xanchor':'center'},
                width=800, height = 600)

    return fig1, fig2

if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=8077)

In [None]:
#Industry Revenue Growth/Employees_count Total in each state among Top 100 Companies

# Sum the three numeric columns per (State, Industry). The columns are selected
# with a list: selecting several columns with a bare tuple after groupby was
# deprecated and raises on pandas >= 2.0.
# NOTE(review): 'Revenue growth' is summed across companies like the other
# columns; confirm that a sum (rather than a mean) is the intended metric.
df = data.groupby(['State', 'Industry'])[['Revenue growth', 'Employees', 'Revenue (USD millions)']].sum().reset_index()\
     .rename(columns={'Employees':'Employee Cnt'})

app = JupyterDash(__name__)

# Layout: a dropdown with every distinct state and two graphs
# (revenue growth per industry, employee count per industry).
app.layout = html.Div([
    html.H3('State Selection:'),
    dcc.Dropdown(id='dropdown',
                options=df.State.unique(),
                value='California'),
    dcc.Graph('visual1'),
    dcc.Graph('visual2')
])

@app.callback(Output('visual1', 'figure'),
             Output('visual2', 'figure'),
             Input('dropdown', 'value'))
def plot(state):
    """Build the per-industry growth and employee charts for the selected state.

    Args:
        state: State selected in the dropdown, or None when the dropdown has
            been cleared.

    Returns:
        A pair of figures: summed revenue growth per industry (colored by
        revenue), and summed employee count per industry (colored by revenue
        growth), both restricted to `state`.

    Raises:
        PreventUpdate: When no state is selected, so the current figures are
            kept instead of rendering empty charts titled "... in None".
    """
    if state is None:
        raise PreventUpdate

    dfm = df[df['State'] == state]

    fig1 = px.bar(
        dfm.sort_values('Revenue growth', ascending=False),
        x='Revenue growth',
        y='Industry',
        color='Revenue (USD millions)',
        title=f"Industry Total Revnue Growth in {state}",
        text_auto=True
    ).update_traces(showlegend=False)\
     .update_layout(
                title={
                'x':0.5,
                'y':0.88,
                'xanchor':'center'},
                height=500)

    fig2 = px.bar(
        dfm.sort_values('Employee Cnt', ascending=False),
        x='Employee Cnt',
        y='Industry',
        color='Revenue growth',
        title=f"Industry Total Employee Cnt in {state}",
        text_auto=True
    ).update_traces(showlegend=False)\
     .update_layout(
                title={
                'x':0.5,
                'y':0.88,
                'xanchor':'center'},
                height=500)

    return fig1, fig2

if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=8078)
    
    
    
    

In [None]:
# Display the table in its current state.
data

In [None]:
# Number of companies per city, most frequent first.
data['City'].value_counts()

In [None]:
#Correlation

In [None]:
# Pairwise correlation of the numeric columns. The numeric columns are selected
# explicitly because `DataFrame.corr()` raises on text columns on
# pandas >= 2.0 (older versions silently dropped them).
data.select_dtypes('number').corr()

In [None]:
# Names of the numeric columns, i.e. the columns of the correlation matrix.
# Selecting by dtype avoids calling `DataFrame.corr()` on text columns, which
# raises on pandas >= 2.0.
data.select_dtypes('number').columns

In [None]:
app = JupyterDash(__name__)

# Numeric columns offered on both axes. Selecting by dtype replaces
# `data.corr().columns`, which computed a full correlation matrix twice just
# to get the names and raises on text columns on pandas >= 2.0.
numeric_columns = data.select_dtypes('number').columns.tolist()

# Layout: x and y column dropdowns, a trendline on/off switch and one graph.
app.layout = html.Div([
    html.H3('Column Selection:'),
    dcc.Dropdown(id='dropdown_x',
                options=numeric_columns,
                value='Revenue (USD millions)'),
    html.Br(),
    dcc.Dropdown(id='dropdown_y',
                options=numeric_columns,
                value='Employees'),
    html.Br(),
    dcc.RadioItems(id='items',
                  options=['Trendline Off', 'Trendline On'],
                  value='Trendline Off'),
    dcc.Graph(id='visual')
])

@app.callback(Output('visual', 'figure'),
              Input('dropdown_x', 'value'),
              Input('dropdown_y', 'value'),
              Input('items', 'value'))
def plot(x, y, switch):
    """Build a scatter plot of column `y` against column `x`.

    Marker size follows 'Revenue (USD millions)'.

    Args:
        x: Column selected for the x axis, or None when that dropdown has
            been cleared.
        y: Column selected for the y axis, or None when that dropdown has
            been cleared.
        switch: 'Trendline Off' for no trendline; any other value adds an
            OLS trendline.

    Returns:
        The plotly figure shown in the 'visual' graph.

    Raises:
        PreventUpdate: When either axis has no column selected, so the
            current figure is kept instead of failing on `None.title()`.
    """
    if x is None or y is None:
        raise PreventUpdate

    fig = px.scatter(
        data,
        x=x,
        y=y,
        size='Revenue (USD millions)',
        trendline=None if switch == 'Trendline Off' else 'ols',
        title=f'{x.title()} vs. {y.title()}'
    )

    return fig

if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=8079)

In [None]:
# Grid of pairwise relationship plots for `data` (seaborn pairplot).
sns.pairplot(data)