Import relevant Libraries

In [1]:
import pandas as pd
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import matplotlib.pyplot as plt
sns.set_style('whitegrid')

Read all files

In [2]:
# Major file, inside here we have id_major	id_university	type	major_name	capacity columns
majors = pd.read_csv('./files/majors.csv')
print(majors.isna().sum()) #check for some null 
majors.sample(5)

Unnamed: 0       0
id_major         0
id_university    0
type             0
major_name       0
capacity         0
dtype: int64


Unnamed: 0.1,Unnamed: 0,id_major,id_university,type,major_name,capacity
1132,1132,5211174,521,science,AGROTEKNOLOGI,88
211,211,1411211,141,science,SISTEM INFORMASI,32
37,37,1111382,111,science,PERENCANAAN WILAYAH DAN KOTA,24
751,751,3561076,356,science,PENDIDIKAN TEKNIK ELEKTRO,24
3134,3134,9112106,911,humanities,BIMBINGAN KONSELING,13


In [3]:
# we have id_university and the name of the university 
universities = pd.read_csv('./files/universities.csv')
print(universities.isna().sum()) #check some null 
universities.sample(5)

Unnamed: 0         0
id_university      0
university_name    0
dtype: int64


Unnamed: 0.1,Unnamed: 0,id_university,university_name
53,53,383,UNIVERSITAS NEGERI SURABAYA
80,80,821,UNIVERSITAS KHAIRUN
6,6,121,UNIVERSITAS SUMATERA UTARA
23,23,321,UNIVERSITAS INDONESIA
65,65,613,ISI DENPASAR


In [4]:
# in here we have all the test score from humanities
score_humanities = pd.read_csv('./files/score_humanities.csv')
print(score_humanities.isna().sum()) #check some null 
score_humanities.sample(5)

Unnamed: 0              0
id_first_major          0
id_first_university     0
id_second_major         0
id_second_university    0
id_user                 0
score_eko               0
score_geo               0
score_kmb               0
score_kpu               0
score_kua               0
score_mat               0
score_ppu               0
score_sej               0
score_sos               0
dtype: int64


Unnamed: 0.1,Unnamed: 0,id_first_major,id_first_university,id_second_major,id_second_university,id_user,score_eko,score_geo,score_kmb,score_kpu,score_kua,score_mat,score_ppu,score_sej,score_sos
16504,16504,3232336,323,3232352,323,112453,433,463,519,492,402,498,514,383,598
55459,55459,3622234,362,3622033,362,304766,612,616,604,514,597,437,479,557,586
54228,54228,3212154,321,3332105,333,299266,700,444,707,630,671,404,428,571,595
21555,21555,3332264,333,3332225,333,144958,622,464,494,565,563,455,423,466,563
18100,18100,3832277,383,3812095,381,122354,561,572,505,375,393,587,559,722,343


In [5]:
# we have all the test score from science
score_science = pd.read_csv('./files/score_science.csv')
print(score_science.isna().sum()) #check some null 
score_science.sample(5)

Unnamed: 0              0
id_first_major          0
id_first_university     0
id_second_major         0
id_second_university    0
id_user                 0
score_bio               0
score_fis               0
score_kim               0
score_kmb               0
score_kpu               0
score_kua               0
score_mat               0
score_ppu               0
dtype: int64


Unnamed: 0.1,Unnamed: 0,id_first_major,id_first_university,id_second_major,id_second_university,id_user,score_bio,score_fis,score_kim,score_kmb,score_kpu,score_kua,score_mat,score_ppu
59389,59389,3341166,334,3731211,373,237212,403,590,604,628,691,574,345,578
22316,22316,3851112,385,3861041,386,99994,317,612,705,619,558,433,514,665
11583,11583,3611027,361,3611437,361,56858,447,590,818,592,489,542,429,427
70168,70168,3211216,321,3211197,321,281279,589,484,637,607,493,538,592,528
39209,39209,3361082,336,3361035,336,166098,572,532,448,523,491,502,459,561


there is no missing data in any of these files, so we can continue to merge some files

In [6]:
major_univ = pd.merge(left=universities[['id_university','university_name']],right=majors[['id_major','major_name','id_university','capacity']],on='id_university')
major_univ # we have merged our data, so it will be easier to be read

Unnamed: 0,id_university,university_name,id_major,major_name,capacity
0,111,UNIVERSITAS SYIAH KUALA,1111014,PENDIDIKAN DOKTER HEWAN,88
1,111,UNIVERSITAS SYIAH KUALA,1111022,TEKNIK SIPIL,64
2,111,UNIVERSITAS SYIAH KUALA,1111037,TEKNIK MESIN,48
3,111,UNIVERSITAS SYIAH KUALA,1111045,TEKNIK KIMIA,48
4,111,UNIVERSITAS SYIAH KUALA,1111053,ARSITEKTUR,48
...,...,...,...,...,...
3162,921,UNIVERSITAS PAPUA,9212011,EKONOMI PEMBANGUNAN,32
3163,921,UNIVERSITAS PAPUA,9212042,MANAJEMEN,32
3164,921,UNIVERSITAS PAPUA,9212057,AKUNTANSI,32
3165,921,UNIVERSITAS PAPUA,9212065,PENDIDIKAN BAHASA INDONESIA,16


for science data

In [7]:
df_science = pd.merge(left=score_science,right=major_univ,left_on=['id_first_major','id_first_university'],right_on=['id_major','id_university'],suffixes=('_left','_right')) # merge first major and first university
df_science = df_science.rename(columns={'university_name':'first_univ','major_name':'first_major','capacity':'first_capacity'}).drop(['id_university','id_major'],axis=1) # then rename the columns name that have been merged and also drop the id
df_science = pd.merge(left=df_science,right=major_univ,left_on=['id_second_major','id_second_university'],right_on=['id_major','id_university'],suffixes=('_left','_right')) #  now second major and second university
df_science = df_science.rename(columns={'university_name':'second_univ','major_name':'second_major','capacity':'second_capacity'}).drop(['id_university','id_major','Unnamed: 0','id_first_major','id_first_university','id_second_major','id_second_university','id_user'],axis=1) # same like before

In [8]:
df_science # now we can just use a single file to get all of the information

Unnamed: 0,score_bio,score_fis,score_kim,score_kmb,score_kpu,score_kua,score_mat,score_ppu,first_univ,first_major,first_capacity,second_univ,second_major,second_capacity
0,400,400,400,400,400,400,400,400,INSTITUT TEKNOLOGI BANDUNG,SEKOLAH ILMU & TEKNO. HAYATI - PROG. SAINS,64,UNIVERSITAS PADJADJARAN,PSIKOLOGI,80
1,704,447,630,561,518,541,585,599,INSTITUT TEKNOLOGI BANDUNG,SEKOLAH ILMU & TEKNO. HAYATI - PROG. SAINS,64,UNIVERSITAS PADJADJARAN,PSIKOLOGI,80
2,715,532,633,579,732,804,402,608,INSTITUT TEKNOLOGI BANDUNG,SEKOLAH ILMU & TEKNO. HAYATI - PROG. SAINS,64,UNIVERSITAS PADJADJARAN,PSIKOLOGI,80
3,446,511,400,548,679,567,513,538,INSTITUT TEKNOLOGI BANDUNG,SEKOLAH ILMU & TEKNO. HAYATI - PROG. SAINS,64,UNIVERSITAS PADJADJARAN,PSIKOLOGI,80
4,489,533,367,481,487,544,499,469,INSTITUT TEKNOLOGI BANDUNG,SEKOLAH ILMU & TEKNO. HAYATI - PROG. SAINS,64,UNIVERSITAS PADJADJARAN,PSIKOLOGI,80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86223,336,466,476,511,378,393,589,470,UNIVERSITAS NEGERI GORONTALO,PENDIDIKAN GEOGRAFI,54,UNIVERSITAS NEGERI GORONTALO,PENDIDIKAN ILMU PENGETAHUAN ALAM,27
86224,520,483,654,545,606,722,425,495,UNIVERSITAS BORNEO TARAKAN,AKUAKULTUR,48,UNIVERSITAS BORNEO TARAKAN,AGRIBISNIS,48
86225,435,413,441,408,516,526,564,390,UNIVERSITAS SYIAH KUALA,AGROTEKNOLOGI (PSDKU GAYO LUES),20,UNIVERSITAS SYIAH KUALA,KEHUTANAN (PSDKU GAYO LUES),16
86226,482,516,453,546,569,597,587,633,UNIVERSITAS PALANGKARAYA,KIMIA,30,UNIVERSITAS PALANGKARAYA,FISIKA,30


In [9]:
# because the capacity of utbk is 40% from the maximum capacity so we need to change the capacity value
df_science['first_capacity'] = df_science['first_capacity'].apply(lambda x: int(x*40/100))
df_science['second_capacity'] = df_science['second_capacity'].apply(lambda x: int(x*40/100))

In [10]:
df_science['specialized_score']=df_science[['score_bio','score_fis','score_kim','score_mat']].mean(axis=1) #get avg score 
df_science['general_score'] = df_science[['score_kmb','score_kpu','score_kua','score_ppu']].mean(axis=1)  #get avg score
df_science['average_score'] = df_science.iloc[:,:8].mean(axis=1) # avg score

In [11]:
df_science.columns.values

array(['score_bio', 'score_fis', 'score_kim', 'score_kmb', 'score_kpu',
       'score_kua', 'score_mat', 'score_ppu', 'first_univ', 'first_major',
       'first_capacity', 'second_univ', 'second_major', 'second_capacity',
       'specialized_score', 'general_score', 'average_score'],
      dtype=object)

In [12]:
df_science = df_science[['score_bio', 'score_fis', 'score_kim','score_mat', 'score_kmb', 'score_kpu','score_kua', 'score_ppu','specialized_score','general_score','average_score','first_univ', 'first_major','first_capacity', 'second_univ', 'second_major', 'second_capacity']] #rearrange columns index 

In [13]:
df_science

Unnamed: 0,score_bio,score_fis,score_kim,score_mat,score_kmb,score_kpu,score_kua,score_ppu,specialized_score,general_score,average_score,first_univ,first_major,first_capacity,second_univ,second_major,second_capacity
0,400,400,400,400,400,400,400,400,400.00,400.00,400.000,INSTITUT TEKNOLOGI BANDUNG,SEKOLAH ILMU & TEKNO. HAYATI - PROG. SAINS,25,UNIVERSITAS PADJADJARAN,PSIKOLOGI,32
1,704,447,630,585,561,518,541,599,591.50,554.75,573.125,INSTITUT TEKNOLOGI BANDUNG,SEKOLAH ILMU & TEKNO. HAYATI - PROG. SAINS,25,UNIVERSITAS PADJADJARAN,PSIKOLOGI,32
2,715,532,633,402,579,732,804,608,570.50,680.75,625.625,INSTITUT TEKNOLOGI BANDUNG,SEKOLAH ILMU & TEKNO. HAYATI - PROG. SAINS,25,UNIVERSITAS PADJADJARAN,PSIKOLOGI,32
3,446,511,400,513,548,679,567,538,467.50,583.00,525.250,INSTITUT TEKNOLOGI BANDUNG,SEKOLAH ILMU & TEKNO. HAYATI - PROG. SAINS,25,UNIVERSITAS PADJADJARAN,PSIKOLOGI,32
4,489,533,367,499,481,487,544,469,472.00,495.25,483.625,INSTITUT TEKNOLOGI BANDUNG,SEKOLAH ILMU & TEKNO. HAYATI - PROG. SAINS,25,UNIVERSITAS PADJADJARAN,PSIKOLOGI,32
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86223,336,466,476,589,511,378,393,470,466.75,438.00,452.375,UNIVERSITAS NEGERI GORONTALO,PENDIDIKAN GEOGRAFI,21,UNIVERSITAS NEGERI GORONTALO,PENDIDIKAN ILMU PENGETAHUAN ALAM,10
86224,520,483,654,425,545,606,722,495,520.50,592.00,556.250,UNIVERSITAS BORNEO TARAKAN,AKUAKULTUR,19,UNIVERSITAS BORNEO TARAKAN,AGRIBISNIS,19
86225,435,413,441,564,408,516,526,390,463.25,460.00,461.625,UNIVERSITAS SYIAH KUALA,AGROTEKNOLOGI (PSDKU GAYO LUES),8,UNIVERSITAS SYIAH KUALA,KEHUTANAN (PSDKU GAYO LUES),6
86226,482,516,453,587,546,569,597,633,509.50,586.25,547.875,UNIVERSITAS PALANGKARAYA,KIMIA,12,UNIVERSITAS PALANGKARAYA,FISIKA,12


for humanities data

In [14]:
df_humanites = pd.merge(left=score_humanities,right=major_univ,left_on=['id_first_major','id_first_university'],right_on=['id_major','id_university'],suffixes=('_left','_right')) # merge first major and first university
df_humanites = df_humanites.rename(columns={'university_name':'first_univ','major_name':'first_major','capacity':'first_capacity'}).drop(['id_university','id_major'],axis=1) # then rename the columns name that have been merged and also drop the id
df_humanites = pd.merge(left=df_humanites,right=major_univ,left_on=['id_second_major','id_second_university'],right_on=['id_major','id_university'],suffixes=('_left','_right')) #  now second major and second university
df_humanites = df_humanites.rename(columns={'university_name':'second_univ','major_name':'second_major','capacity':'second_capacity'}).drop(['id_university','id_major','Unnamed: 0','id_first_major','id_first_university','id_second_major','id_second_university','id_user'],axis=1) # same like before

In [15]:
df_humanites.head(3)

Unnamed: 0,score_eko,score_geo,score_kmb,score_kpu,score_kua,score_mat,score_ppu,score_sej,score_sos,first_univ,first_major,first_capacity,second_univ,second_major,second_capacity
0,778,486,679,594,643,686,559,392,676,INSTITUT TEKNOLOGI BANDUNG,FAKULTAS SENIRUPA DAN DESAIN (FSRD),100,UNIVERSITAS SEBELAS MARET,DESAIN KOMUNIKASI VISUAL,30
1,803,638,659,696,685,494,576,676,694,INSTITUT TEKNOLOGI BANDUNG,FAKULTAS SENIRUPA DAN DESAIN (FSRD),100,UNIVERSITAS SEBELAS MARET,DESAIN KOMUNIKASI VISUAL,30
2,601,560,575,611,629,509,607,775,629,INSTITUT TEKNOLOGI BANDUNG,FAKULTAS SENIRUPA DAN DESAIN (FSRD),100,UNIVERSITAS SEBELAS MARET,DESAIN KOMUNIKASI VISUAL,30


In [16]:
# because the capacity of utbk is 40% from the maximum capacity so we need to change the capacity value
df_humanites['first_capacity'] = df_humanites['first_capacity'].apply(lambda x: int(x*40/100))
df_humanites['second_capacity'] = df_humanites['second_capacity'].apply(lambda x: int(x*40/100))

In [17]:
df_humanites.columns

Index(['score_eko', 'score_geo', 'score_kmb', 'score_kpu', 'score_kua',
       'score_mat', 'score_ppu', 'score_sej', 'score_sos', 'first_univ',
       'first_major', 'first_capacity', 'second_univ', 'second_major',
       'second_capacity'],
      dtype='object')

In [18]:
df_humanites['specialized_score']=df_humanites[['score_eko','score_geo','score_sej','score_sos']].mean(axis=1) #get avg score 
df_humanites['general_score'] = df_humanites[['score_kmb','score_kpu','score_kua','score_ppu','score_mat']].mean(axis=1)  #get avg score
df_humanites['average_score'] = df_humanites.iloc[:,:8].mean(axis=1) # avg score

In [19]:
df_humanites = df_humanites[['score_eko', 'score_geo', 'score_sej','score_sos','score_mat', 'score_kmb', 'score_kpu','score_kua', 'score_ppu','specialized_score','general_score','average_score','first_univ', 'first_major','first_capacity', 'second_univ', 'second_major', 'second_capacity']] #rearrange columns index 

In [20]:
df_humanites.head()

Unnamed: 0,score_eko,score_geo,score_sej,score_sos,score_mat,score_kmb,score_kpu,score_kua,score_ppu,specialized_score,general_score,average_score,first_univ,first_major,first_capacity,second_univ,second_major,second_capacity
0,778,486,392,676,686,679,594,643,559,583.0,632.2,602.125,INSTITUT TEKNOLOGI BANDUNG,FAKULTAS SENIRUPA DAN DESAIN (FSRD),40,UNIVERSITAS SEBELAS MARET,DESAIN KOMUNIKASI VISUAL,12
1,803,638,676,694,494,659,696,685,576,702.75,622.0,653.375,INSTITUT TEKNOLOGI BANDUNG,FAKULTAS SENIRUPA DAN DESAIN (FSRD),40,UNIVERSITAS SEBELAS MARET,DESAIN KOMUNIKASI VISUAL,12
2,601,560,775,629,509,575,611,629,607,641.25,586.2,608.375,INSTITUT TEKNOLOGI BANDUNG,FAKULTAS SENIRUPA DAN DESAIN (FSRD),40,UNIVERSITAS SEBELAS MARET,DESAIN KOMUNIKASI VISUAL,12
3,612,494,617,548,548,524,549,589,508,567.75,543.6,555.125,INSTITUT TEKNOLOGI BANDUNG,FAKULTAS SENIRUPA DAN DESAIN (FSRD),40,UNIVERSITAS SEBELAS MARET,DESAIN KOMUNIKASI VISUAL,12
4,493,573,474,560,641,586,582,504,608,525.0,584.2,557.625,INSTITUT TEKNOLOGI BANDUNG,FAKULTAS SENIRUPA DAN DESAIN (FSRD),40,UNIVERSITAS SEBELAS MARET,DESAIN KOMUNIKASI VISUAL,12


visualization

In [21]:
a = df_science.drop(['second_univ',	'second_major',	'second_capacity'],axis=1).groupby('first_univ').size().sort_values(ascending=False).reset_index().head(10).rename(columns={0:'Total','first_univ':'Universities'}) #DF for science

b = df_humanites.drop(['second_univ',	'second_major',	'second_capacity'],axis=1).groupby('first_univ').size().sort_values(ascending=False).reset_index().head(10).rename(columns={0:'Total','first_univ':'Universities'}) #DF for humanit

In [22]:
fig = px.bar(data_frame=a,y='Universities',x='Total',color='Universities',text_auto=True,pattern_shape_sequence=["\\"],width=1300, color_discrete_sequence=px.colors.qualitative.Bold)
fig.update_layout(title='Top Universities Choices 2019 Based on First Choice for Science Majors')
fig.update_yaxes(tickangle=0, tickfont=dict(family='Rockwell', color='black', size=12))
fig.update_xaxes(tickangle=0, tickfont=dict(family='Rockwell', color='red', size=12))
fig.update_traces(textfont_size=11, textangle=0, textposition="outside", cliponaxis=True)
fig.show()

fig = px.bar(data_frame=b,y='Universities',x='Total',color='Universities',text_auto=True,pattern_shape_sequence=["/"],width=1300)
fig.update_layout(title='Top Universities Choices 2019 Based on First Choice for Humanities Majors')
fig.update_yaxes(tickangle=0, tickfont=dict(family='Rockwell', color='black', size=12))
fig.update_xaxes(tickangle=0, tickfont=dict(family='Rockwell', color='red', size=12))
fig.update_traces(textfont_size=11, textangle=0, textposition="outside", cliponaxis=True)
fig.show()

![Top Universities Choices 2019 Based on First Choice for Science Majors](photos/Top%20Universities%20Choices%202019%20Based%20on%20First%20Choice%20for%20Science%20Majors.png)`

![Top Universities Choices 2019 Based on First Choice for Humanities Majors](photos/Top%20Universities%20Choices%202019%20Based%20on%20First%20Choice%20for%20Humanities%20Majors.png)

In [23]:
df_univ_brawSc = df_science.loc[(df_science['first_univ']=='UNIVERSITAS BRAWIJAYA')].drop(['second_univ',	'second_major'	,'second_capacity'],axis=1)
df_univ_brawHu = df_humanites.loc[(df_humanites['first_univ']=='UNIVERSITAS BRAWIJAYA')].drop(['second_univ',	'second_major'	,'second_capacity'],axis=1)

In [24]:
a = df_univ_brawSc.groupby('first_major').agg({'first_univ':'count','first_capacity':'mean'}).sort_values('first_univ',ascending=True).reset_index().rename(columns={'first_univ':'Total choices','first_capacity':'Capacity'})
b = df_univ_brawHu.groupby('first_major').agg({'first_univ':'count','first_capacity':'mean'}).sort_values('first_univ',ascending=True).reset_index().rename(columns={'first_univ':'Total choices','first_capacity':'Capacity'})

In [25]:
colors = ['gold', 'mediumturquoise', 'darkorange', 'lightgreen']
fig = px.pie(values=[len(a),len(b)],names=['Science Major','Humanities Major'],height=500,width=700,hole=.2)
fig.update_traces(textposition='inside', textinfo='value+label',hoverinfo='label+percent',marker=dict(colors=colors, line=dict(color='#000000', width=2)))
fig.update_layout(title='Total Majors in UB')
fig.show()

![Total Majors](photos/Total%20Majors.png))

In [26]:
#science
fig = px.bar(data_frame=a,y='first_major',x=['Capacity','Total choices'],barmode='group',height=1400,text_auto=True,pattern_shape_sequence=["x"],)
fig.update_layout(title='UB Science majors based on the first choice 2019',xaxis_title='',yaxis_title='Majors')
fig.update_traces(textfont_size=36, textangle=0, textposition="outside", cliponaxis=False)
fig.show()

#humanities
fig = px.bar(data_frame=b,y='first_major',x=['Capacity','Total choices'],barmode='group',height=1000,text_auto=True,pattern_shape_sequence=["x"],color_discrete_map={
        'Total choicess': 'purple',
        'some_other_group': 'green'
    })
fig.update_layout(title='UB Humanities majors based on the first choice 2019',xaxis_title='',yaxis_title='Majors')
fig.update_traces(textfont_size=16, textangle=0, textposition="outside", cliponaxis=True)
fig.show()

![UB Science majors based on the first choice 2019](photos/UB%20Science%20majors%20based%20on%20the%20first%20choice%202019.png)

!['UB Humanities majors based on the first choice 2019'](photos/UB%20Humanities%20majors%20based%20on%20the%20first%20choice%202019.png)

In [27]:
a['Percentage'] = (a['Capacity']/a['Total choices'])*100
a['Percentage'] = a['Percentage'].apply(lambda x: f"{x:.3f}")
a = a.sort_values('Percentage',ascending=True,ignore_index=True)
b['Percentage'] = (b['Capacity']/b['Total choices'])*100
b['Percentage'] = b['Percentage'].apply(lambda x: f"{x:.3f}")
b = b.sort_values('Percentage',ascending=True,ignore_index=True)

In [28]:
colors = ['gold', 'mediumturquoise', 'darkorange', 'lightgreen']
fig = px.pie(values=[9.091,9.639,10.870,11.290,11.515,12.556,12.821,13.270,13.913,13.990],names=['ILMU GIZI','PENDIDIKAN DOKTER GIGI','PENDIDIKAN DOKTER','TEKNIK SIPIL','FARMASI','PERENCANAAN WILAYAH & KOTA','TEKNIK LINGKUNGAN','ARSITEKTUR','TEKNIK INDUSTRI','ILMU DAN TEKNOLOGI PANGAN'],height=600,width=1000)
fig.update_traces(textposition='inside', textinfo='value+label',marker=dict(colors=colors, line=dict(color='#000000', width=1)))
fig.update_layout(title='Percentage Strictest Science Majors')
fig.show()

fig = px.pie(values=b['Percentage'].head(10),names=b['first_major'].head(10),height=600,width=1000)
fig.update_traces(textposition='inside', textinfo='value+label',marker=dict(line=dict(color='#000000', width=1)))
fig.update_layout(title='Percentage Strictest Humanities Majors')
fig.show()

![Percentage Strictest Science Majors](photos/Percentage%20Strictest%20Science%20Majors.png)

![Percentage Strictest Humanities Majors](photos/Percentage%20Strictest%20Humanities%20Majors.png)

![Science Major UB Top Choices 2019](/photos/Science%20Major%20UB%20Top%20Choices%202019.png)

In [29]:
df_univ_brawSc.columns

Index(['score_bio', 'score_fis', 'score_kim', 'score_mat', 'score_kmb',
       'score_kpu', 'score_kua', 'score_ppu', 'specialized_score',
       'general_score', 'average_score', 'first_univ', 'first_major',
       'first_capacity'],
      dtype='object')

In [30]:
def find_the_lowest_score(df):
    df_out = pd.DataFrame()
    for key,subdf in df.sort_values('average_score',ascending=False).groupby('first_major'):
        capacity = subdf['first_capacity'].iloc[0]
        data = subdf['average_score'].iloc[:capacity]
        data_specialized_score = subdf['specialized_score'].iloc[:capacity]
        data_general_score = subdf['general_score'].iloc[:capacity]
        major = subdf['first_major'].iloc[:capacity]
        data = pd.concat([data_general_score,data_specialized_score,data,major,],axis=1)
        t = pd.DataFrame(data)
        df_out = pd.concat([df_out,t],ignore_index=True)
    return df_out

In [31]:
df_univ_brawSc_clear = find_the_lowest_score(df_univ_brawSc)
df_univ_brawHm_clear = find_the_lowest_score(df_univ_brawHu)

In [32]:
fig = go.Figure()
fig.add_trace(go.Box(y=df_univ_brawSc_clear['specialized_score'], name='specialized_score',
                marker_color = 'indianred'))
fig.add_trace(go.Box(y=df_univ_brawSc_clear['general_score'], name = 'general_score',
                marker_color = 'lightseagreen'))
fig.add_trace(go.Box(y=df_univ_brawSc_clear['average_score'], name = 'average_score',
                marker_color = 'mediumvioletred'))
fig.update_layout(title_text='UB Distribution scores for Science Major 2019')
fig.show()

fig = go.Figure()
fig.add_trace(go.Box(y=df_univ_brawHm_clear['specialized_score'], name='specialized_score',
                marker_color = 'indianred'))
fig.add_trace(go.Box(y=df_univ_brawHm_clear['general_score'], name = 'general_score',
                marker_color = 'lightseagreen'))
fig.add_trace(go.Box(y=df_univ_brawHm_clear['average_score'], name = 'average_score',
                marker_color = 'mediumvioletred'))
fig.update_layout(title_text='UB Distribution scores for Humanities Major 2019')
fig.show()

![UB Distribution scores for Science Major 2019](/photos/UB%20Distribution%20scores%202019.png)

![UB Distribution scores for Humanities Major 2019](photos/UB%20Distribution%20scores%20for%20Humanities%20Major%202019.png)

In [33]:
df_humanites

Unnamed: 0,score_eko,score_geo,score_sej,score_sos,score_mat,score_kmb,score_kpu,score_kua,score_ppu,specialized_score,general_score,average_score,first_univ,first_major,first_capacity,second_univ,second_major,second_capacity
0,778,486,392,676,686,679,594,643,559,583.00,632.2,602.125,INSTITUT TEKNOLOGI BANDUNG,FAKULTAS SENIRUPA DAN DESAIN (FSRD),40,UNIVERSITAS SEBELAS MARET,DESAIN KOMUNIKASI VISUAL,12
1,803,638,676,694,494,659,696,685,576,702.75,622.0,653.375,INSTITUT TEKNOLOGI BANDUNG,FAKULTAS SENIRUPA DAN DESAIN (FSRD),40,UNIVERSITAS SEBELAS MARET,DESAIN KOMUNIKASI VISUAL,12
2,601,560,775,629,509,575,611,629,607,641.25,586.2,608.375,INSTITUT TEKNOLOGI BANDUNG,FAKULTAS SENIRUPA DAN DESAIN (FSRD),40,UNIVERSITAS SEBELAS MARET,DESAIN KOMUNIKASI VISUAL,12
3,612,494,617,548,548,524,549,589,508,567.75,543.6,555.125,INSTITUT TEKNOLOGI BANDUNG,FAKULTAS SENIRUPA DAN DESAIN (FSRD),40,UNIVERSITAS SEBELAS MARET,DESAIN KOMUNIKASI VISUAL,12
4,493,573,474,560,641,586,582,504,608,525.00,584.2,557.625,INSTITUT TEKNOLOGI BANDUNG,FAKULTAS SENIRUPA DAN DESAIN (FSRD),40,UNIVERSITAS SEBELAS MARET,DESAIN KOMUNIKASI VISUAL,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60989,287,423,453,457,498,488,447,501,428,405.00,472.4,440.625,UNIVERSITAS NUSA CENDANA,BIMBINGAN KONSELING,27,UNIVERSITAS NUSA CENDANA,PEND. ANAK USIA DINI,18
60990,469,541,572,435,618,578,416,446,331,504.25,477.8,496.375,UNIVERSITAS NEGERI GORONTALO,PENDIDIKAN JASMANI KESEHATAN DAN REKREASI,36,UNIVERSITAS NEGERI GORONTALO,PENDIDIKAN KEPELATIHAN,21
60991,612,455,434,501,527,482,453,408,447,500.50,463.4,477.250,UNIVERSITAS PALANGKARAYA,PENDIDIKAN SENDRATASIK,9,UNIVERSITAS PALANGKARAYA,PEND. GURU PEND ANAK USIA DINI (PG PAUD),13
60992,511,351,616,373,493,417,412,463,434,462.75,443.8,462.125,UNIVERSITAS TADULAKO,SOSIOLOGI,42,UNIVERSITAS TADULAKO,PEND. GURU SEKOLAH DASAR,30


In [35]:
df_science

Unnamed: 0,score_bio,score_fis,score_kim,score_mat,score_kmb,score_kpu,score_kua,score_ppu,specialized_score,general_score,average_score,first_univ,first_major,first_capacity,second_univ,second_major,second_capacity
0,400,400,400,400,400,400,400,400,400.00,400.00,400.000,INSTITUT TEKNOLOGI BANDUNG,SEKOLAH ILMU & TEKNO. HAYATI - PROG. SAINS,25,UNIVERSITAS PADJADJARAN,PSIKOLOGI,32
1,704,447,630,585,561,518,541,599,591.50,554.75,573.125,INSTITUT TEKNOLOGI BANDUNG,SEKOLAH ILMU & TEKNO. HAYATI - PROG. SAINS,25,UNIVERSITAS PADJADJARAN,PSIKOLOGI,32
2,715,532,633,402,579,732,804,608,570.50,680.75,625.625,INSTITUT TEKNOLOGI BANDUNG,SEKOLAH ILMU & TEKNO. HAYATI - PROG. SAINS,25,UNIVERSITAS PADJADJARAN,PSIKOLOGI,32
3,446,511,400,513,548,679,567,538,467.50,583.00,525.250,INSTITUT TEKNOLOGI BANDUNG,SEKOLAH ILMU & TEKNO. HAYATI - PROG. SAINS,25,UNIVERSITAS PADJADJARAN,PSIKOLOGI,32
4,489,533,367,499,481,487,544,469,472.00,495.25,483.625,INSTITUT TEKNOLOGI BANDUNG,SEKOLAH ILMU & TEKNO. HAYATI - PROG. SAINS,25,UNIVERSITAS PADJADJARAN,PSIKOLOGI,32
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86223,336,466,476,589,511,378,393,470,466.75,438.00,452.375,UNIVERSITAS NEGERI GORONTALO,PENDIDIKAN GEOGRAFI,21,UNIVERSITAS NEGERI GORONTALO,PENDIDIKAN ILMU PENGETAHUAN ALAM,10
86224,520,483,654,425,545,606,722,495,520.50,592.00,556.250,UNIVERSITAS BORNEO TARAKAN,AKUAKULTUR,19,UNIVERSITAS BORNEO TARAKAN,AGRIBISNIS,19
86225,435,413,441,564,408,516,526,390,463.25,460.00,461.625,UNIVERSITAS SYIAH KUALA,AGROTEKNOLOGI (PSDKU GAYO LUES),8,UNIVERSITAS SYIAH KUALA,KEHUTANAN (PSDKU GAYO LUES),6
86226,482,516,453,587,546,569,597,633,509.50,586.25,547.875,UNIVERSITAS PALANGKARAYA,KIMIA,12,UNIVERSITAS PALANGKARAYA,FISIKA,12


In [36]:
def find_the_lowest_score2(df):
    df_out = pd.DataFrame()
    for key,subdf in df.sort_values('average_score',ascending=False).groupby('first_major'):
        capacity = subdf['first_capacity'].iloc[0]
        data = subdf['score_sej'].iloc[:capacity]
        data_score_eko = subdf['score_eko'].iloc[:capacity]
        data_score_geo = subdf['score_geo'].iloc[:capacity]
        data_score_mat = subdf['score_mat'].iloc[:capacity]
        data_score_kmb = subdf['score_kmb'].iloc[:capacity]
        data_score_kpu = subdf['score_kpu'].iloc[:capacity]
        data_score_kua = subdf['score_kua'].iloc[:capacity]
        data_score_ppu = subdf['score_ppu'].iloc[:capacity]
        major = subdf['score_sos'].iloc[:capacity]
        data = pd.concat([data_score_geo,data_score_eko,data,major,data_score_mat,data_score_kmb,data_score_kpu,data_score_ppu,data_score_kua],axis=1)
        t = pd.DataFrame(data)
        df_out = pd.concat([df_out,t],ignore_index=True)
    return df_out

In [37]:
def find_the_lowest_score3(df):
    df_out = pd.DataFrame()
    for key,subdf in df.sort_values('average_score',ascending=False).groupby('first_major'):
        capacity = subdf['first_capacity'].iloc[0]
        data = subdf['score_bio'].iloc[:capacity]
        data_score_eko = subdf['score_fis'].iloc[:capacity]
        data_score_geo = subdf['score_kim'].iloc[:capacity]
        data_score_mat = subdf['score_mat'].iloc[:capacity]
        data_score_kmb = subdf['score_kmb'].iloc[:capacity]
        data_score_kpu = subdf['score_kpu'].iloc[:capacity]
        data_score_kua = subdf['score_kua'].iloc[:capacity]
        data_score_ppu = subdf['score_ppu'].iloc[:capacity]
        data = pd.concat([data_score_geo,data_score_eko,data,data_score_mat,data_score_kmb,data_score_kpu,data_score_ppu,data_score_kua],axis=1)
        t = pd.DataFrame(data)
        df_out = pd.concat([df_out,t],ignore_index=True)
    return df_out

In [38]:
df_distHm = find_the_lowest_score2(df_humanites)
df_distSc = find_the_lowest_score3(df_science)

In [39]:
fig = px.box(data_frame=df_distHm,y=df_distHm.columns.values,color_discrete_sequence=px.colors.qualitative.Vivid)
fig.update_layout(title='Distribution Each Subject in Humanities',xaxis_title='')
fig.show()

fig = px.box(data_frame=df_distSc,y=df_distSc.columns.values,color_discrete_sequence=px.colors.qualitative.Antique)
fig.update_layout(title='Distribution Each Subject in Science',xaxis_title='')
fig.show()

![Distribution Each Subject in Humanities](photos/Distribution%20Each%20Subject%20in%20Humanities.png)

![Distribution Each Subject in Scence](photos/Distribution%20Each%20Subject%20in%20Science.png)

In [40]:
fig = px.bar(data_frame=df_univ_brawSc_clear.groupby('first_major').mean().reset_index().sort_values('average_score',ascending=False),x='average_score',y='first_major',color='first_major',height=1200,width=1200,text_auto=True,color_discrete_sequence=px.colors.qualitative.Set3)
fig.update_layout(showlegend=False,title='Avg Score in Science Majors UB 2019',xaxis_title='Avg Score',yaxis_title='Majors',bargap=0.15,
bargroupgap=0.1)
fig.update_traces(textfont_size=16, textangle=0, textposition="inside", cliponaxis=True)
fig.show()

fig = px.bar(data_frame=df_univ_brawHm_clear.groupby('first_major').mean().reset_index().sort_values('average_score',ascending=False),x='average_score',y='first_major',color='first_major',height=1000,width=1000,text_auto=True,color_discrete_sequence=px.colors.qualitative.Set2)
fig.update_layout(showlegend=False,title='Avg Score in Humanities Majors UB 2019',xaxis_title='Avg Score',yaxis_title='Majors',bargap=0.15,
bargroupgap=0.1)
# fig.update_traces(textfont_size=16, textangle=0, textposition="outside", cliponaxis=False)
fig.show()

![Avg Score in Science Majors UB 2019](/photos/Avg%20Score%20in%20Science%20Majors%20UB%202019.png)

![Avg Score in Humanities Majors UB 2019](photos/Avg%20Score%20in%20Humanities%20Majors%20UB%202019.png)

In [41]:
fig = px.bar(data_frame=df_univ_brawSc_clear.groupby('first_major').mean().sort_values('average_score',ascending=False).head(5).reset_index(),x=['average_score','specialized_score','general_score'],y='first_major',text_auto=True,barmode='group',height=500)

fig.update_layout(showlegend=True,title='Top 5 Avg Score Science Majors 2019',xaxis_title='Avg Scores',yaxis_title='Majors',bargap=0.15,
bargroupgap=0.1,yaxis={'categoryorder':'total ascending'})
fig.update_yaxes(tickangle=0, tickfont=dict(family='Rockwell', color='black', size=12))
fig.update_xaxes(tickangle=0, tickfont=dict(family='Rockwell', color='red', size=12))
fig.update_traces(textfont_size=11, textangle=0, textposition="outside", cliponaxis=True)
fig.show()

fig = px.bar(data_frame=df_univ_brawHm_clear.groupby('first_major').mean().sort_values('average_score',ascending=False).head(5).reset_index(),x=['average_score','specialized_score','general_score'],y='first_major',text_auto=True,barmode='group',height=500)

fig.update_layout(showlegend=True,title='Top 5 Avg Score Humanities Majors 2019',xaxis_title='Avg Scores',yaxis_title='Majors',bargap=0.15,
bargroupgap=0.1,yaxis={'categoryorder':'total ascending'})
fig.update_yaxes(tickangle=0, tickfont=dict(family='Rockwell', color='black', size=11))
fig.update_xaxes(tickangle=0, tickfont=dict(family='Rockwell', color='red', size=11))
fig.update_traces(textfont_size=11, textangle=0, textposition="outside", cliponaxis=True)
fig.show()

![Top 5 Avg Score Science Majors 2019](photos/Top%205%20Avg%20Score%20Science%20Majors%202019.png)

![Top 5 Avg Score Humanities Majors 2019](photos/Top%205%20Avg%20Score%20Humanities%20Majors%202019.png)

In [152]:
df_univ_braw_dokter = df_univ_brawSc_clear.loc[df_univ_brawSc_clear['first_major']=='PENDIDIKAN DOKTER'].sort_values(by=['average_score','specialized_score','general_score'],ascending=False)

df_univ_braw_akuntansi = df_univ_brawHm_clear.loc[df_univ_brawHm_clear['first_major']=='AKUNTANSI'].sort_values(by=['average_score','specialized_score','general_score'],ascending=False)

In [158]:
fig = make_subplots(rows=1, cols=2, shared_yaxes=True,subplot_titles=('Highest Scores','Lowest Scores'))

fig.add_trace(
    go.Bar(y=df_univ_braw_dokter.loc[:,'average_score',].iloc[:5],name='Highest Score',text=df_univ_braw_dokter.loc[:,'average_score'].iloc[:5],
            marker=dict(color=[1, 1, 1,1,1])),
    row=1, col=1
)

fig.add_trace(
    go.Bar(y=df_univ_braw_dokter.loc[:,'average_score'].iloc[-5:],name='Lowest Score',text=df_univ_braw_dokter.loc[:,'average_score'].iloc[-5:],
            marker=dict(color=['green', 'green', 'green','green','green'])),
    row=1, col=2,
)
fig.update_layout(title_text='Medical UB Avg Scores ',showlegend=False,)
fig.show()


fig = make_subplots(rows=1, cols=2, shared_yaxes=True,subplot_titles=('Highest Scores','Lowest Scores'))

fig.add_trace(
    go.Bar(y=df_univ_braw_akuntansi.loc[:,'average_score',].iloc[:5],name='Highest Score',text=df_univ_braw_akuntansi.loc[:,'average_score'].iloc[:5]),
    row=1, col=1
)

fig.add_trace(
    go.Bar(y=df_univ_braw_akuntansi.loc[:,'average_score'].iloc[-5:],name='Lowest Score',text=df_univ_braw_akuntansi.loc[:,'average_score'].iloc[-5:]),
    row=1, col=2,
)
fig.update_layout(title_text='Akuntansi UB Avg Scores',showlegend=False,)
fig.show()



# fig.update_xaxes(tickangle=0, tickfont=dict(family='Rockwell', color='black', size=14))

![Medical School UB Avg Scores 2019](/photos/Medical%20School%20UB%20Avg%20Scores%202019.png)

![Akuntansi UB Avg Scores](photos/Akuntansi%20UB%20Avg%20Scores.png)