In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException
from bs4 import BeautifulSoup
import os
# langdetect is non-deterministic by default: pin its seed so that every run
# of the notebook assigns the same language to the same text.
DetectorFactory.seed = 0

In [2]:
# Relative path of the raw job-ads sample
DATA_PATH = '../Data/sample_1mil_jobs.csv'

# Read the sample and discard the 'Unnamed: 0' column in a single expression
# (presumably a row index written out by an earlier export -- TODO confirm).
df = pd.read_csv(DATA_PATH).drop(columns='Unnamed: 0')

In [3]:
def detect_language(x):
    """Return the language code detected for ``x``, or ``'Other'`` on failure.

    Parameters
    ----------
    x : str
        Text to classify. Non-string values (e.g. ``None`` or ``NaN`` coming
        from a missing cell) and blank strings are reported as ``'Other'``.

    Returns
    -------
    str
        Whatever ``detect`` returns for the text, or ``'Other'`` when the text
        is missing, blank, or langdetect raises ``LangDetectException``.
    """
    # Missing or blank cells carry nothing to classify; answer directly rather
    # than relying on the detector blowing up on them.
    if not isinstance(x, str) or not x.strip():
        return 'Other'
    try:
        return detect(x)
    # Catch only langdetect's own failure: a bare ``except`` also swallowed
    # KeyboardInterrupt (making a long ``.apply`` impossible to stop) and hid
    # genuine programming errors behind the 'Other' label.
    except LangDetectException:
        return 'Other'

# Cleaning

In [4]:
def strip_html(text):
    """Return the text of an HTML fragment with the markup removed.

    Non-string values (e.g. NaN from a missing 'content' cell) yield an empty
    string instead of making BeautifulSoup raise.
    """
    if not isinstance(text, str):
        return ''
    # NOTE(review): get_text() joins text nodes without a separator, so words
    # from adjacent tags can end up fused; kept as-is to leave the cleaned
    # output unchanged.
    return BeautifulSoup(text, 'html.parser').get_text()


# Remove HTML characters and expressions from strings
df['content_no_java'] = df['content'].apply(strip_html)

# Remove URLs, then turn '\r' and '\n' into spaces. The three substitutions run
# in the original order; Series.replace leaves non-string cells untouched, so
# this step no longer fails on a missing value.
df['content_no_URL'] = (
    df['content_no_java']
    .replace(r'http\S+', '', regex=True)
    .replace(r'www\S+', '', regex=True)
    .replace(r'[\r\n]', ' ', regex=True)
)

# Language split

In [5]:
# Tag every ad with the language detected on its cleaned text
cleaned_text = df['content_no_URL']
df['language'] = cleaned_text.apply(detect_language)

In [6]:
# Separate into different dataframes (names kept for use elsewhere in the notebook)
df_fr = df[df['language'] == 'fr']
df_de = df[df['language'] == 'de']
df_it = df[df['language'] == 'it']
df_en = df[df['language'] == 'en']

# Display label -> (language code used in the file name, matching dataframe)
language_frames = {
    'French': ('fr', df_fr),
    'German': ('de', df_de),
    'Italian': ('it', df_it),
    'English': ('en', df_en),
}


# Statistics
total_ads = df.shape[0]
for label, (_, frame) in language_frames.items():
    # Guard the percentage so an empty input frame does not divide by zero
    share = 100 * frame.shape[0] / total_ads if total_ads else 0.0
    print(label, 'ads :', frame.shape[0], 'corresponding to', share, '%')


# Saving datasets in '.csv' files
OUTPUT_DIR = '../Data/language_split'

# exist_ok=True keeps the cell re-runnable: os.mkdir raised FileExistsError
# as soon as the directory had been created by a previous run.
os.makedirs(OUTPUT_DIR, exist_ok=True)

df.to_csv(OUTPUT_DIR + '/data_language.csv', index=False)
for code, frame in language_frames.values():
    frame.to_csv(OUTPUT_DIR + '/data_' + code + '.csv', index=False)

French ads : 149400 corresponding to 14.94 %
German ads : 808252 corresponding to 80.8252 %
Italian ads : 10113 corresponding to 1.0113 %
English ads : 31146 corresponding to 3.1146 %
