In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

#!pip install missingno
import missingno as msno

#!pip install plotly
#!pip install -U kaleido
import plotly.express as px
# Route DataFrame.plot / Series.plot calls through plotly instead of the default backend.
pd.set_option("plotting.backend", "plotly")

In [None]:
# Load the theses export. The file is decoded as latin-1, and low_memory=False
# makes pandas read it in a single pass instead of in chunks.
df = pd.read_csv(
    'datasets/theses_v2.csv',
    low_memory=False,
    encoding='latin-1',
)

# Missing Data

In [None]:
# Draw missingno's matrix view of df, then write the current matplotlib
# figure (the one msno just drew on) to disk.
msno.matrix(df)
missing_fig = plt.gcf()
missing_fig.savefig("plots/missing_plot.png")

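Create a simulated number-of-pages column (`nb_pages`): page counts are drawn from a normal distribution (mean 200, standard deviation 50), and about 20% of them are set to missing.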

In [None]:
from scipy.stats import bernoulli as bn

# Synthetic 'nb_pages' column: one normally distributed page count per row of df,
# with a random subset blanked out to simulate missing values.
# A fixed seed keeps the generated column identical on every Restart & Run All.
PAGES_SEED = 42
pages_rng = np.random.default_rng(PAGES_SEED)

mu, sigma = 200, 50
pages = sigma * pages_rng.standard_normal((1, df.shape[0])) + mu

# Bernoulli(p=0.8) draw per row: 1 keeps the page count, 0 marks it as missing.
flag = bn.rvs(p=0.8, size=(1, df.shape[0]), random_state=pages_rng)
pages[flag == 0] = np.nan

# pages has shape (1, n_rows); flatten it to a single value per row.
df['nb_pages'] = pages.ravel()

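Dealing with missing data using an imputation technique

Note: the next cell does not impute values; it drops the rows whose defence date (`Date de soutenance`) is missing and parses the remaining dates.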

In [None]:
# Rows without a defence date are dropped before the column is parsed as datetimes.
df.dropna(subset=['Date de soutenance'], inplace=True)
df['Date de soutenance'] = pd.DatetimeIndex(df['Date de soutenance'])

# Derive the defence year here so the groupby below does not depend on a 'Year'
# column created by a later cell (which would fail on a fresh kernel).
# NOTE(review): if the CSV already ships a 'Year' column this overwrites it with
# the year of the defence date, as the later filtering cell does anyway - confirm.
df['Year'] = df['Date de soutenance'].dt.year

# Number of theses per year (count of non-null 'Titre'), as a one-column frame
# indexed by 'Year'; later cells use it as the yearly denominator.
years = df.groupby('Year')[['Titre']].count()

# New Year's Day

In [None]:
# Share of each year's theses whose recorded defence date is 1 January.
defence_dates = df['Date de soutenance']
is_new_year = (defence_dates.dt.day == 1) & (defence_dates.dt.month == 1)

# .copy() so that adding 'Year' writes to an independent frame rather than to a
# filtered view of df (avoids SettingWithCopyWarning).
df_0101 = df[is_new_year].copy()
df_0101['Year'] = df_0101['Date de soutenance'].dt.year

# One row per year: number of 1 January theses (count of non-null 'Titre').
df_0101 = df_0101.groupby('Year', as_index=False).agg(nb_Thesis=('Titre', 'count'))

# Look up the yearly total by mapping 'Year' onto the 'Titre' count stored in
# `years`, instead of a row-wise apply that returns a Series for every row.
df_0101['nb_Thesis_byyear'] = df_0101['Year'].map(years['Titre'])
df_0101['Percentage'] = df_0101['nb_Thesis'] / df_0101['nb_Thesis_byyear'] * 100

fig = df_0101.plot(x='Year', y='Percentage', title='Percentage of thesis defended on New Year\'s Day')
# Display the figure, as the other plotting cells do.
fig



In [None]:
# Keep theses defended after 2010 whose recorded date is not 1 January.
# NOTE(review): 1 January dates are presumably placeholders - confirm.
defence_dates = df['Date de soutenance']
on_new_year = (defence_dates.dt.day == 1) & (defence_dates.dt.month == 1)

# .copy() so the column assignments below write to an independent frame rather
# than to a filtered view of the previous df (avoids SettingWithCopyWarning).
df = df[(defence_dates.dt.year > 2010) & ~on_new_year].copy()
df['Month'] = df['Date de soutenance'].dt.month
df['Year'] = df['Date de soutenance'].dt.year

In [None]:
# One row per (Year, Month): number of theses (count of non-null 'Titre').
df_months = df.groupby(['Year', 'Month'], as_index=False).agg(nb_Thesis=('Titre', 'count'))

# Yearly total looked up from `years` with a Series map rather than a row-wise
# apply that returns a Series for every row.
# NOTE(review): `years` is computed before the year > 2010 / 1 January filter, so
# the denominator still includes 1 January theses and the monthly percentages of
# a year need not add up to 100 - confirm this is intended.
df_months['nb_Thesis_byyear'] = df_months['Year'].map(years['Titre'])
df_months['Percentage'] = df_months['nb_Thesis'] / df_months['nb_Thesis_byyear'] * 100

# Timestamp for the first day of each (Year, Month), used as the x axis.
df_months['Time'] = pd.to_datetime(df_months[['Year', 'Month']].assign(day=1))

In [None]:
# Line chart of the monthly percentage over time; exported as PNG and shown inline.
fig = df_months.plot.line(
    x='Time',
    y='Percentage',
    title='Percentage of thesis defended by month',
)
fig.write_image("plots/thesis_by_month.png")
fig

In [None]:
# Average the monthly percentage across years for each calendar month.
# Only 'Percentage' is aggregated: it is the only column the chart uses, and this
# avoids averaging 'Year', the counts and the datetime 'Time' column.
monthly_mean = df_months.groupby('Month', as_index=False)['Percentage'].mean()

fig = monthly_mean.plot.bar(x='Month', y='Percentage', title='Percentage of thesis defended during the year')
fig.write_image("plots/percentage_thesis_during_year.png")
fig

# Languages

Create a gender column using the gender-guesser library

In [None]:
#!pip install gender-guesser
import gender_guesser.detector as gender
import re

# Strip any parenthesised part from the author string and trailing whitespace,
# then split on the first space: text before it is 'first_name', the rest is
# 'last_name'. The pattern is a raw string so '\(' is not an invalid string
# escape, and `n` is passed by keyword because pandas 2.x no longer accepts it
# positionally.
author_clean = df['Auteur'].apply(lambda s: re.sub(r'\([^\)]+\)', '', s).rstrip())
df[['first_name', 'last_name']] = author_clean.str.split(' ', n=1, expand=True)

# Guess a gender label from the first name (case-insensitive lookup).
detector = gender.Detector(case_sensitive=False)
df['Gender'] = df['first_name'].apply(detector.get_gender)

# Collapse the 'mostly_*' labels and capitalise the result.
rename_dict = {'mostly_female': 'Female', 'mostly_male': 'Male', 'male' : 'Male', 'female' : 'Female'}
# Assign the result back: an in-place replace on df['Gender'] acts on a
# temporary object and is not guaranteed to update df.
df['Gender'] = df['Gender'].replace(rename_dict)

# Drop rows whose label is 'unknown' or 'andy'
# ('andy' is presumably gender-guesser's label for androgynous names - confirm).
df.drop(df[df['Gender'].isin(['unknown', 'andy'])].index, inplace=True)

In [None]:
# Per-gender counts of non-null values in every column; the 'Auteur' count is
# exposed as 'nb_Thesis' and drives the pie chart.
gender_counts = df.groupby('Gender').count()
df_gender = gender_counts.rename(columns={'Auteur' : 'nb_Thesis'}).reset_index()

fig = px.pie(df_gender, names='Gender', values='nb_Thesis')
fig.write_image("plots/gender_pie.png")
fig