In [89]:
import pandas as pd
import dash
from dash.dependencies import Input, Output, State
import dash_core_components as dcc
import dash_html_components as html
import dash_table
import plotly
from plotly import graph_objs as go
from plotly.graph_objs import *
from flask import Flask
import pandas as pd
import numpy as np
import os
import sqlite3
import copy

### Connecting to the Database

In [90]:
conn = sqlite3.connect("/Users/ankitkothari/Documents/COMPLETED_PROJECTS/H1B_data_analysis/us_h1b.db")

### Filtering Criteria

In [91]:
filter_query = ''' 
select  
h1b.Employer,
h1b2.Denials,
h1b2.Approvals,
h1b2.Fiscal_Year
from h1b left join
(
select 
Employer,
SUM(Initial_Denials) + SUM(Continuing_Denials) Denials,
count(DISTINCT Fiscal_Year) Fiscal_Year,
SUM(h1b.Initial_Approvals)+ SUM(h1b.Continuing_Approvals) Approvals
from h1b 
where h1b.Fiscal_Year !='2019'
group by 1
) h1b2 on h1b.Employer = h1b2.Employer
group by 1
having h1b2.Fiscal_Year>9 and h1b2.Denials>2 and h1b2.Approvals >50
;'''



pandas_filter_query = pd.read_sql_query(filter_query, conn)
pandas_filter_query.to_csv("/Users/ankitkothari/Documents/dash-app/pandas_filter_query1.csv")
pandas_filter_query['Denials']=pandas_filter_query['Denials'].astype(int)
print(pandas_filter_query.head())

              Employer  Denials  Approvals  Fiscal_Year
0          3A SOFT INC        3         82           10
1    3CORE SYSTEMS INC       22        163           10
2      3I INFOTECH INC      144       1486           10
3  3K TECHNOLOGIES LLC       13        215           10
4           3M COMPANY        5        240           10


### Initializing the DASH APP

In [92]:
app = dash.Dash()

app.css.append_css({'external_url': 'https://cdn.rawgit.com/plotly/dash-app-stylesheets/2d266c578d2a6e8850ebce48fdb52759b2aef506/stylesheet-oil-and-gas.css'})


### Drop Down Menu tp Select Employer

In [93]:
employer_class = [{'label': str(item),
                      'value': str(item)}
                     for item in pandas_filter_query['Employer'].unique()]

In [94]:
employer_class[0:4]

[{'label': '3A SOFT INC', 'value': '3A SOFT INC'},
 {'label': '3CORE SYSTEMS INC', 'value': '3CORE SYSTEMS INC'},
 {'label': '3I INFOTECH INC', 'value': '3I INFOTECH INC'},
 {'label': '3K TECHNOLOGIES LLC', 'value': '3K TECHNOLOGIES LLC'}]

### App Layout

In [95]:
app.layout = html.Div([
    html.Div(
        [
            html.H1(
                'H1B VISA TRENDS',
                style={'font-family': 'Helvetica',
                       "margin-top": "0",
                       "margin-bottom": "0",
                       "color":"black",
                       "width": "100%"},
                className='eight columns',
            ),
        ], className='row', style={'display': 'inline-block'}
    ),
    html.Div(
        [
            html.Div([
                 #dcc.Input(id='my-id', value='Choose your Employer:', type='text'),
                 html.P('Please select the Employer:'),
                 dcc.Dropdown(
                        id='employer',
                        options= employer_class,
                        multi=False,
                        value=None
            )
                ],
                className='eight columns',
                style={'margin-top': '10', 'margin-right': "0"}

            ),

        ], className='row',style={'width': '120%', 'display': 'inline-block'} ),
    html.Div(
        [

                dcc.Graph(
                id='bar-graph-2',
                style={"margin-right": "0"},
                className='five columns',
            ),
                dcc.Graph(
                id='bar-graph',
                style={"margin-left": "10"},
                className='five columns',
            ),
            html.H2('Data'),
            html.Div([
        html.P('1. The Data has been taken from the USCIS website.'),
        html.P('2. The Data has been cleaned and analyzed, so there may be inaccuracies'),
        html.P('3. This should not be treated as a source of truth'),
        html.P('4. New Approvals and Continuing Approvals are combined together.'),
        html.P('5. Employers who have used H1B program for atleast 8 fiscal years'),
        html.P('   are only counted.'),
    ])
        ], className='row',style={'width': '100%', 'display': 'inline-block'}),
    html.Div(
        [
            
                dcc.Graph(
                id='bar-graph-3',
                style={"margin-right": "0"},
                className='five columns',
            ),
                dcc.Graph(
                id='map-graph',
                style={"margin-right": "0"},
                className='five columns',
            ),
                html.H2('Connect'),
        dcc.Markdown('''
[**LinkedIn**](https://www.linkedin.com/in/ankit-kothari-510a9623/)

[**Code**](https://github.com/ankit-kothari/Data-Science-Journey/tree/master/Data%20Exploration%20Analysis%20and%20Visualization/H1B-Data-Analysis-master).
'''),
            html.Div([
        html.P('Please connect with me if you have any questions or if you like this')])
        ], className='row', style={'width': '100%', 'display': 'inline-block'}),
])


### Querying Approvals and Denials By Fiscal Year for each Employer

In [96]:
h1b_query20 = ''' 
with employer_filter as
(
select  
h1b.Employer, count(DISTINCT h1b.Fiscal_Year) Fiscal_Year,
h1b2.Denials
from h1b left join
(
select 
Employer,
SUM(Initial_Denials) + SUM(Continuing_Denials) Denials,
SUM(h1b.Initial_Approvals)+ SUM(h1b.Continuing_Approvals) Approvals
from h1b 
group by 1
) h1b2 on h1b.Employer = h1b2.Employer
where h1b.Fiscal_Year !='2019'
group by 1
having count(DISTINCT h1b.Fiscal_Year)>9 and h1b2.Denials>2 and h1b2.Approvals >50
)
select  
h1b.Fiscal_Year,h1b.Employer,
SUM(h1b.Initial_Approvals)+ SUM(h1b.Continuing_Approvals) Approvals, 
SUM(h1b.Initial_Denials)+SUM(h1b.Continuing_Denials) AS Denials
from employer_filter ef left join h1b on h1b.Employer=ef.Employer
where h1b.Fiscal_Year !='2019'
group by h1b.Fiscal_Year, h1b.Employer
'''

pandas_fiscal_year = pd.read_sql_query(h1b_query20, conn)
pandas_fiscal_year.to_csv("/Users/ankitkothari/Documents/dash-app/pandas_fiscal_year1.csv")
print(pandas_fiscal_year.head())

   Fiscal_Year             Employer  Approvals  Denials
0         2009          3A SOFT INC          2      0.0
1         2009    3CORE SYSTEMS INC          7      3.0
2         2009      3I INFOTECH INC         20      0.0
3         2009  3K TECHNOLOGIES LLC         16      1.0
4         2009           3M COMPANY         13      1.0


### Ploting Approvals and Denials By Fiscal Year for selected Employer

In [97]:
@app.callback(
    dash.dependencies.Output('bar-graph', 'figure'),
    [dash.dependencies.Input('employer', 'value')]
)


def fiscal_plot(employer=None):
  try: 
        employer=employer.upper() 
  except:
        employer=None
  if employer is not None:
    df21=pandas_fiscal_year[pandas_fiscal_year['Employer']==employer]
    df21=df21.groupby('Fiscal_Year').sum()
    df21=df21.reset_index()
    print(df21.head())
  else:
    employer='APPLE'
    df21=pandas_fiscal_year[pandas_fiscal_year['Employer']==employer]
    df21=df21.groupby('Fiscal_Year').sum()
    df21=df21.reset_index()
    print(df21.head())
    print(df21)
  fig = go.Figure()
  fig.add_trace(go.Bar(x=[x for x in df21.Fiscal_Year] , y=df21.Approvals,marker_color='#2677bb', name='Count of Approvals'))
  fig.add_trace(go.Scatter(x=[x for x in df21.Fiscal_Year], y=df21.Denials, mode='lines', name='Count of Denials', yaxis="y2", line=dict(color='#bfbabe', width=4)))
  fig.update_layout(
    plot_bgcolor='rgba(0,0,0,0)'
)
  fig.update_xaxes(
    dtick=1,showgrid=False
)
  fig.update_yaxes(
    showgrid=False
)
  fig.update_layout(title="Approvals and Denials by Fiscal year for {}".format(employer), 
                 legend=dict(x=.03,y=0.98, traceorder='reversed', font_size=12), 
                 width=800,
                 height=400,
                 uniformtext_minsize=8,
                 uniformtext_mode='hide',
                 yaxis=dict(
        title="Count of Approvals (Bar)",
        titlefont=dict(
            color="#151515"
        ),
        anchor="x",           
        tickfont=dict(
            color="#151515"
        )
    ),
    yaxis2=dict(
        title="Count of Denials (line)",
        titlefont=dict(
            color="#151515"
        ),
        tickfont=dict(
            color="#151515"
        ),
        anchor="x",
        side="right",
        zeroline=False,
        overlaying="y",
        position=1
    ),)
  fig.update_layout(
    plot_bgcolor='#e0e5db'
)
  fig.add_shape(
        # Rectangle reference to the axes
            type="rect",
            xref="x",
            yref="paper",
            x0='2016',
            y0=-0.01,
            x1='2018',
            y1=1.1,
            line=dict(
                color="#007500",
                width=5,
            ),
        )
  return fig

### Query to how you compare to national Average

In [98]:
h1b_query26 ='''
with national as
(
select  
SUM(Initial_Denials) + SUM(Continuing_Denials) AS Denials, 
SUM(Initial_Approvals)  + SUM(Continuing_Approvals) + SUM(Initial_Denials) + SUM(Continuing_Denials) AS Totals
from h1b
where Fiscal_Year !='2019'
),
employer as
(
select  
Employer,  SUM(Initial_Denials) + SUM(Continuing_Denials) AS Denials, 
SUM(Initial_Approvals)  + SUM(Continuing_Approvals) + SUM(Initial_Denials) + SUM(Continuing_Denials) AS Totals
from h1b
group by Employer
order by 3 desc
)
select 
employer.Employer,
CAST(national.Denials AS REAL)/ CAST(national.Totals AS REAL) AS national_average,
CAST(employer.Denials AS REAL)/ CAST(employer.Totals AS REAL) AS employer_average
from national, employer
;'''
  
pandas_health_query = pd.read_sql_query(h1b_query26, conn)
pandas_health_query.to_csv("/Users/ankitkothari/Documents/dash-app/pandas_health_query1.csv")

In [99]:
pandas_health_query.head()

Unnamed: 0,Employer,national_average,employer_average
0,COGNIZANT,0.067428,0.07769
1,INFOSYS,0.067428,0.04001
2,TATA,0.067428,0.065242
3,WIPRO,0.067428,0.063889
4,DELOITTE,0.067428,0.111768


### Ploting how you compare to national Average

In [100]:
@app.callback(
    dash.dependencies.Output('bar-graph-2', 'figure'),
    [dash.dependencies.Input('employer', 'value')]
)

def health(employer=None):
  try: 
    employer=employer.upper()
  except:
    employer=None
  if employer is not None:
    df35a = pandas_health_query[pandas_health_query['Employer']==employer]
  else:
    employer='APPLE'
    df35a = pandas_health_query[pandas_health_query['Employer']==employer]
  print(df35a)
  df35a = pd.melt(df35a, id_vars=['Employer'], value_vars=['national_average','employer_average'])
  df35a['value']=df35a['value'].apply(lambda x: round(x,2)*100)
  colors = ['#2677bb',] * 2
  colors[1] = '#007500'

  fig = go.Figure(data=[go.Bar(
    y=['National <br> (USA)', '{}'.format(employer)],
    x=[x  for  x in df35a['value']],
    width=.51,
    orientation='h',
    marker_color=colors, # marker color can be a single color value or an iterable
    text=[int(x)  for  x in df35a['value']],
    textposition='outside'# marker color can be a single color value or an iterable
)])
  fig.update_layout(
    plot_bgcolor='rgba(0,0,0,0)'
)
  fig.update_yaxes(
        tickangle = 360,
        tickfont=dict(family='Rockwell', color='#151515', size=14))
  fig.update_traces(marker_line_width=.5, opacity=0.9)
  fig.update_layout(title="How you compare with National Denial Rate", 
                 legend=dict(x=.73,y=0.98, traceorder='reversed', font_size=12), 
                 width=800,
                 height=400,
                 uniformtext_minsize=12,
                 xaxis=dict(
        title="H1B Visa Denial Rate %",
        titlefont=dict(
            color="#151515"
        ),
        tickfont=dict(
            color="#151515"
        )
    ),
    )
  return fig

### Query How you compare pre and post 2016 with other Employers

In [101]:
h1b_query21a= ''' 
with h1b_table_by_state AS
(
select  
h1b.Employer,   
SUM(h1b.Initial_Approvals)  + SUM(h1b.Continuing_Approvals) AS approvals_pre_2016, 
SUM(h1b.Initial_Denials) + SUM(h1b.Continuing_Denials) AS denials_pre_2016,
(CAST(SUM(h1b.Initial_Denials) AS REAL) + CAST(SUM(h1b.Continuing_Denials) AS REAL)) / (CAST(SUM(h1b.Initial_Denials) AS REAL) + CAST(SUM(h1b.Continuing_Denials) AS REAL)+CAST(SUM(h1b.Initial_Approvals) AS REAL) + CAST(SUM(h1b.Continuing_Approvals) AS REAL))*100 AS denial_pre_2016,
h1b2.Employer,
h1b2.approvals_post_2016, 
h1b2.denials_post_2016,
h1b2.denial_post_2016

from h1b LEFT JOIN (
    select  
       Employer,  
       SUM(Initial_Approvals)  + SUM(Continuing_Approvals) AS approvals_post_2016, 
       SUM(Initial_Denials) + SUM(Continuing_Denials) AS denials_post_2016,
       (CAST(SUM(Initial_Denials) AS REAL) + CAST(SUM(Continuing_Denials) AS REAL)) / (CAST(SUM(Initial_Denials) AS REAL) + CAST(SUM(Continuing_Denials) AS REAL)+CAST(SUM(Initial_Approvals) AS REAL) + CAST(SUM(Continuing_Approvals) AS REAL))*100 AS denial_post_2016,
       Fiscal_Year
       from h1b
       where Fiscal_Year !='2019' and Fiscal_Year>2016
       group by Employer

) h1b2 ON h1b.Employer = h1b2.Employer
where h1b.Fiscal_Year !='2019' and h1b.Fiscal_Year<=2016
group by h1b.Employer
), fiscal_count as
(
select  
Employer, count(DISTINCT h1b.Fiscal_Year) Fiscal_Year 
from h1b 
where h1b.Fiscal_Year !='2019'
group by 1
having count(DISTINCT h1b.Fiscal_Year)>9
)
select 
hs.Employer,
fc.Fiscal_Year,
hs.denial_pre_2016 AS denial_rate_pre_2016,
hs.denial_post_2016 AS denial_rate_post_2016,
hs.denial_post_2016 - hs.denial_pre_2016 AS delta_denial_rates_pre_post2016
from h1b_table_by_state hs  join fiscal_count fc on hs.Employer=fc.Employer
order by 4 desc
;
'''

pandas_compare_query = pd.read_sql_query(h1b_query21a, conn)
pandas_compare_query.to_csv("/Users/ankitkothari/Documents/dash-app/pandas_compare_query1.csv")
pandas_compare_query.shape

(2447, 5)

In [102]:
pandas_comparison_query=pandas_filter_query.merge(pandas_compare_query, how='left', left_on='Employer', right_on='Employer')
pandas_comparison_query.shape

(1325, 8)

### Plotting How you compare pre and post 2016 with other Employers

In [103]:
@app.callback(
    dash.dependencies.Output('bar-graph-3', 'figure'),
    [dash.dependencies.Input('employer', 'value')]
)


def compare_plot(employer):
  try: 
    employer=employer.upper()
  except:
    employer=None
  if employer is None:
    employer='APPLE'
  companies=["{}".format(employer),"APPLE","FACEBOOK","AMAZON","MICROSOFT","GOOGLE","TATA", "ACCENTURE", "WIPRO","CAPGEMINI","MINDTREE"]
  print(companies)
  df21=pandas_comparison_query
  df21['companies']= df21['Employer'].apply(lambda x: "US_COMPANY" if x in companies  else "NA")
  df21=df21[df21['companies'] != "NA"]
  df21=df21.sort_values(by=['denial_rate_post_2016'], ascending=True)
  df21[['denial_rate_pre_2016','denial_rate_post_2016','delta_denial_rates_pre_post2016']]=df21[['denial_rate_pre_2016','denial_rate_post_2016','delta_denial_rates_pre_post2016']].apply(lambda x: round(x,2))
  fig = go.Figure()
  print(df21)
  y1=[str(x)  for  x in df21['denial_rate_pre_2016']]
  y2=[str(x)  for  x in df21['denial_rate_post_2016']]
  #fig.add_trace(go.Bar(x=df20.Fiscal_Year , y=df20.Approvals, mode='markers+lines', name='JOB TIME', line=dict(color='#e4bd0b', width=2)))
  fig.add_trace(go.Bar(y=[x for x in df21.Employer] , x=df21.denial_rate_pre_2016,marker_color='#2677bb',orientation='h', name='Denial Rate Pre 2016', text=y1,
    textposition='outside'))
  fig.add_trace(go.Bar(y=[x for x in df21.Employer] , x=df21.denial_rate_post_2016,marker_color='#bfbabe',orientation='h', name='Denial Rate Post 2016',text=y2,
    textposition='outside'))
  #fig.add_trace(go.Scatter(x=[x for x in df20.Fiscal_Year], y=df20.Denials, mode='lines', name='Count of Denials', yaxis="y2", line=dict(color='#bfbabe', width=4)))
  fig.update_layout(
    plot_bgcolor='rgba(0,0,0,0)'
)
  fig.update_xaxes(
        tickangle = 0,
        tickfont=dict(family='Rockwell', color='#151515', size=16))

  fig.update_xaxes(
    dtick=2, showgrid=False
)
  fig.update_yaxes(
    dtick=1,showgrid=False
)
  fig.update_yaxes(ticks="outside", tickwidth=3, tickcolor='#e0e5db', ticklen=12)
  fig.update_layout(title="How you compare with other Employers?", 
                 legend=dict(x=.73,y=0.78, traceorder='reversed', font_size=12), 
                 width=600,
                 height=600,
                 yaxis=dict(
        title="",
        titlefont=dict(
            color="#151515"
        ),
        tickfont=dict(
            color="#151515"
        )
    ),
                  xaxis=dict(title="% Denial Rate",titlefont=dict(color="#151515"),
                            tickfont=dict(color="#151515")),)


  return fig

### Query Distribution of Approved Visa Across State

In [104]:
h1b_query35 = ''' 
select  
h1b.State,
h1b.Employer,
SUM(h1b.Initial_Approvals)  + SUM(h1b.Continuing_Approvals) AS total_visas_State
from h1b
where h1b.Fiscal_Year !='2019' and h1b.Employer in (
select  
h1b.Employer
from h1b left join
(
select 
distinct Employer,
SUM(Initial_Denials) + SUM(Continuing_Denials) Denials,
count(DISTINCT Fiscal_Year) Fiscal_Year,
SUM(h1b.Initial_Approvals)+ SUM(h1b.Continuing_Approvals) Approvals
from h1b 
where h1b.Fiscal_Year !='2019'
group by 1
) h1b2 on h1b.Employer = h1b2.Employer
group by 1
having h1b2.Fiscal_Year>9 and h1b2.Denials>2 and h1b2.Approvals >50)
group by 2,1
;'''


map_query = pd.read_sql_query(h1b_query35, conn)
map_query.to_csv("/Users/ankitkothari/Documents/dash-app/map_query1.csv")
map_query['total_visas_State']=map_query['total_visas_State'].astype(float)
map_query[map_query['Employer']=='ACCEL NORTH AMERICA INC']

Unnamed: 0,State,Employer,total_visas_State
24,CA,ACCEL NORTH AMERICA INC,84.0


### Plotting Distribution of Approved Visa Across State

In [105]:
@app.callback(
    dash.dependencies.Output('map-graph', 'figure'),
    [dash.dependencies.Input('employer', 'value')]
)
def update_graph(employer):
  try: 
    employer=employer.upper()
  except:
    employer=None
  if employer is None:
    employer='APPLE'
  df35 = map_query[map_query['Employer']==employer]
  print(df35)
  df35=df35.sort_values(by='total_visas_State', ascending=False)
  df35=df35.dropna(how='any')
  colors  = ["#2677bb" if x < 1000 else '#bfbabe' if x<=10000 else '#007500' for x in df35['total_visas_State']]
  print(colors)
  fig = go.Figure(data=go.Choropleth(
    locations=df35['State'], # Spatial coordinates, # Data to be color-coded
    locationmode = 'USA-states', # set of locations match entries in `locations`
    z = df35['total_visas_State'].astype(float),
    showscale=False,
    colorbar = dict(showticklabels=False),
    colorscale = colors
    ,
    
))

  fig.update_layout(
    title_text = 'Approved H1B Applications for in US By States'.format(employer),
    geo_scope='usa', # limite map scope to USA
)

  return fig


In [None]:
if __name__ == '__main__':
    app.run_server()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:8050/ (Press CTRL+C to quit)
127.0.0.1 - - [02/Oct/2020 03:08:41] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [02/Oct/2020 03:08:42] "[37mGET /_dash-dependencies HTTP/1.1[0m" 200 -
127.0.0.1 - - [02/Oct/2020 03:08:42] "[37mGET /_dash-layout HTTP/1.1[0m" 200 -


    State Employer  total_visas_State
173    CA    APPLE            13744.0
174    CT    APPLE                4.0
175    DC    APPLE                1.0
176    FL    APPLE                0.0
177    MA    APPLE                3.0
178    MI    APPLE                8.0
179    MN    APPLE               10.0
180    NC    APPLE                1.0
181    NJ    APPLE               26.0
182    NY    APPLE                8.0
183    OH    APPLE                7.0
184    PA    APPLE                3.0
185    TN    APPLE                2.0
186    TX    APPLE                2.0
187    VA    APPLE                4.0['APPLE', 'APPLE', 'FACEBOOK', 'AMAZON', 'MICROSOFT', 'GOOGLE', 'TATA', 'ACCENTURE', 'WIPRO', 'CAPGEMINI', 'MINDTREE']

['#007500', '#2677bb', '#2677bb', '#2677bb', '#2677bb', '#2677bb', '#2677bb', '#2677bb', '#2677bb', '#2677bb', '#2677bb', '#2677bb', '#2677bb', '#2677bb', '#2677bb']   Employer  national_average  employer_average
19    APPLE          0.067428          0.008554
   Fiscal_Ye

127.0.0.1 - - [02/Oct/2020 03:08:43] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [02/Oct/2020 03:08:43] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [02/Oct/2020 03:08:43] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [02/Oct/2020 03:08:43] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -


       Employer  Denials  Approvals  Fiscal_Year_x  Fiscal_Year_y  \
422    FACEBOOK       49       8188             10             10   
96        APPLE      112      13823             10             10   
488      GOOGLE      135      18657             10             10   
730   MICROSOFT      438      36174             10             10   
59       AMAZON      323      23819             10             10   
1121       TATA     3932      75435             10             10   
14    ACCENTURE     1470      36740             10             10   
1283      WIPRO     3531      60012             10             10   
738    MINDTREE      344       5706             10             10   
202   CAPGEMINI     2762      16391             10             10   

      denial_rate_pre_2016  denial_rate_post_2016  \
422                   0.46                   0.73   
96                    0.55                   1.22   
488                   0.40                   1.35   
730                   1.03  