In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the job-postings CSV, using the `job_id` column as the row index.
df=pd.read_csv('../input/real-or-fake-fake-jobposting-prediction/fake_job_postings.csv', index_col='job_id')
df.head()

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Column dtypes as inferred by read_csv.
df.dtypes

In [None]:
# Count rows that exactly repeat an earlier row (the job_id index is not part of the comparison).
df.duplicated().value_counts()

In [None]:
# Share of duplicated rows among postings with fraudulent == 0.
df[df.fraudulent==0].duplicated().value_counts(normalize=True)

In [None]:
# Share of duplicated rows among postings with fraudulent == 1.
df[df.fraudulent==1].duplicated().value_counts(normalize=True)

In [None]:
# Keep only the first occurrence of each fully identical row.
df = df.drop_duplicates()

In [None]:
# Absolute class counts of the target column after de-duplication.
df.fraudulent.value_counts()

In [None]:
# Same counts expressed as proportions.
df.fraudulent.value_counts(normalize=True)

In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
df.describe(include='all')

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Work on a copy where every missing value is replaced by the literal string 'na',
# so that "missing" can be counted and compared like any other category.
df_na=df.fillna('na') 

In [None]:
# Postings with fraudulent == 0. Take an explicit copy: a later cell assigns to
# `real.location`, and writing into a boolean-mask slice of `df_na` triggers
# pandas' SettingWithCopyWarning.
real = df_na[df_na.fraudulent == 0].copy()
real.describe(include='all')

In [None]:
# Postings with fraudulent == 1. Take an explicit copy: a later cell assigns to
# `fake.location`, and writing into a boolean-mask slice of `df_na` triggers
# pandas' SettingWithCopyWarning.
fake = df_na[df_na.fraudulent == 1].copy()
fake.describe(include='all')

In [None]:
# Fixed seed so the same ten rows are shown on every Restart & Run All.
fake.sample(10, random_state=42)

In [None]:
# Inspect individual fraudulent postings by job_id (label-based lookup on the index).
# NOTE(review): these ids are hard-coded; a lookup raises KeyError if the id is absent from `fake`.
fake.loc[17512,:]

In [None]:
fake.loc[3609, :]

In [None]:
fake.loc[7655, :]

In [None]:
fake.loc[9843, :]

In [None]:
fake.loc[181,:]

In [None]:
# Fraudulent postings whose description equals the 'na' placeholder.
fake.loc[fake.description=='na']

In [None]:
# Full description text of a single fraudulent posting.
fake.loc[11543, 'description']

In [None]:
# One row per column of `df`: [column name, share of 'na' in real, share of 'na' in fake].
na_rates = pd.DataFrame(
    [
        [
            col,
            int((real[col] == 'na').sum()) / len(real),
            int((fake[col] == 'na').sum()) / len(fake),
        ]
        for col in df.columns
    ]
)

In [None]:
# Name the three positional columns of the missing-rate table.
na_rates.columns=['column','real_na_rates','fake_na_rates']

In [None]:
na_rates

In [None]:
from statsmodels.stats.proportion import proportions_ztest

# Two-sided two-sample proportions z-test per column: is the share of 'na'
# values different between real and fake postings?
# Create the result columns up front so they always exist (NaN for skipped
# columns) and so a re-run of this cell never leaves stale values behind.
na_rates['zstat'] = np.nan
na_rates['p_value'] = np.nan
for col in df.columns:
    counts = np.array([(real[col] == 'na').sum(), (fake[col] == 'na').sum()])
    nobs = np.array([len(real), len(fake)])
    # Skip columns without any 'na' value (and the degenerate empty case).
    if counts.sum() == 0 or nobs.sum() == 0:
        continue
    zstat, p_value = proportions_ztest(count=counts, nobs=nobs, alternative='two-sided')
    row = na_rates['column'] == col
    na_rates.loc[row, 'zstat'] = zstat
    na_rates.loc[row, 'p_value'] = p_value

In [None]:
# Flag columns whose p-value is below 0.005; a missing p-value compares as False.
na_rates['significant_diff'] = na_rates['p_value'].lt(0.005)

In [None]:
# Show the missing-rate table including the test results.
na_rates

In [None]:
# Character length of each free-text field, computed with the vectorised
# string accessor instead of a row-wise DataFrame.apply.
# Missing texts were replaced by the string 'na', so they count as length 2.
df_na['company_profile_length'] = df_na['company_profile'].str.len()
df_na['description_length'] = df_na['description'].str.len()
df_na['requirements_length'] = df_na['requirements'].str.len()
df_na['benefits_length'] = df_na['benefits'].str.len()
# Combined length of the four text fields.
df_na['total_text_length'] = (
    df_na['company_profile_length']
    + df_na['description_length']
    + df_na['requirements_length']
    + df_na['benefits_length']
)

In [None]:
df_na.head()

In [None]:
# Histogram of company-profile length per class; common_norm=False normalises each class separately.
sns.displot(df_na, x='company_profile_length', hue='fraudulent', stat='density', bins=20, common_norm=False, multiple='dodge')

In [None]:
# KDE of the same feature. displot is figure-level, so `ax` holds a FacetGrid rather than an Axes.
# NOTE(review): the x axis is switched to log only after plotting, so the density
# was estimated on the linear scale; displot's `log_scale=True` may be what is wanted - confirm.
ax=sns.displot(df_na, x='company_profile_length', hue='fraudulent', kind='kde', common_norm=False, cut=0)
ax.set(xscale="log")

In [None]:
# Histogram of description length per class.
sns.displot(df_na, x='description_length', hue='fraudulent', stat='density', bins=20, common_norm=False, multiple='dodge')

In [None]:
# KDE of description length per class, shown on a log x axis.
ax=sns.displot(df_na, x='description_length', hue='fraudulent', kind='kde', common_norm=False, cut=0)
ax.set(xscale="log")

In [None]:
# KDE of requirements length per class, shown on a log x axis.
ax=sns.displot(df_na, x='requirements_length', hue='fraudulent', kind='kde', common_norm=False, cut=0)
ax.set(xscale="log")

In [None]:
# KDE of benefits length per class, shown on a log x axis.
ax=sns.displot(df_na, x='benefits_length', hue='fraudulent', kind='kde', common_norm=False, cut=0)
ax.set(xscale="log")

In [None]:
# KDE of the combined text length per class, shown on a log x axis.
ax=sns.displot(df_na, x='total_text_length', hue='fraudulent', kind='kde', common_norm=False, cut=0)
ax.set(xscale="log")

In [None]:
# Bar plots aggregate `fraudulent` with seaborn's default estimator (the mean),
# so each bar is the share of fraudulent postings within that category.
sns.catplot(data=df_na, y='fraudulent', x='telecommuting', kind='bar')

In [None]:
sns.catplot(data=df_na, y='fraudulent', x='has_company_logo', kind='bar')

In [None]:
sns.catplot(data=df_na, y='fraudulent', x='has_questions', kind='bar')

In [None]:
# From here on the category is placed on the y axis and `fraudulent` on the x axis.
sns.catplot(data=df_na, x='fraudulent', y='employment_type', kind='bar')

In [None]:
sns.catplot(data=df_na, x='fraudulent', y='required_experience', kind='bar')

In [None]:
sns.catplot(data=df_na, x='fraudulent', y='required_education', kind='bar')

In [None]:
sns.catplot(data=df_na, x='fraudulent', y='function', kind='bar')

In [None]:
# Twenty most frequent industries overall ('na' stands for a missing industry).
df_na.industry.value_counts()[:20]

In [None]:
# Twenty most frequent industries among fraudulent postings.
fake.industry.value_counts()[:20]

In [None]:
# Twenty most frequent industries among real postings.
real.industry.value_counts()[:20]

In [None]:
# Row-normalised industry x fraudulent table: each row holds the class shares within one industry.
industry=pd.crosstab(df_na.industry, df_na.fraudulent, normalize='index', margins=True)
# Second column by position - presumably fraudulent == 1, given 0/1 labels; verify the column order.
# NOTE(review): rates are not weighted by industry size, so tiny industries can top this list.
industry.iloc[:,1].sort_values(ascending=False)[:20].to_frame(name='fraud rates')

In [None]:
# Twenty most frequent job titles overall.
df_na.title.value_counts()[:20]

In [None]:
# Twenty most frequent job titles among real postings.
real.title.value_counts()[:20]

In [None]:
# Twenty most frequent job titles among fraudulent postings.
fake.title.value_counts()[:20]

In [None]:
# Share of fraudulent titles containing a literal '$' (regex=False, so '$' is not an end-of-string anchor).
fake.title.str.contains('$', regex=False).value_counts(normalize=True)

In [None]:
# Same share for real postings.
real.title.str.contains('$', regex=False).value_counts(normalize=True)

In [None]:
# Twenty most frequent raw location strings overall.
df_na.location.value_counts()[:20]

In [None]:
# Twenty most frequent raw location strings among real postings.
real.location.value_counts()[:20]

In [None]:
# Twenty most frequent raw location strings among fraudulent postings.
fake.location.value_counts()[:20]

In [None]:
# Keep only the first two characters of each location
# (presumably a country code - confirm against the data format).
# This overwrites df.location; the full location string stays available in df_na.
df.location=df.location.str[:2]

In [None]:
df.location.value_counts()

In [None]:
# Same truncation on the real subset; the 'na' placeholder is two characters and stays 'na'.
real.location=real.location.str[:2]
real.location.value_counts(normalize=True).head(10)

In [None]:
# Same truncation on the fraudulent subset.
fake.location=fake.location.str[:2]
fake.location.value_counts(normalize=True).head(10)

In [None]:
# Frequency of each salary_range string; value_counts skips missing values in `df`.
df.salary_range.value_counts()

In [None]:
# Shares among real postings; missing values appear here as the 'na' category.
real.salary_range.value_counts(normalize=True)

In [None]:
# Shares among fraudulent postings; missing values appear here as the 'na' category.
fake.salary_range.value_counts(normalize=True)

In [None]:
real_text=real.title+' '+real.company_profile+' '+real.description+' '+real.requirements+' '+real.benefits

In [None]:
real_text_frame=real_text.to_frame(name='text')

In [None]:
fake_text=fake.title+' '+fake.company_profile+' '+fake.description+' '+fake.requirements+' '+fake.benefits

In [None]:
fake_text_frame=fake_text.to_frame(name='text')

In [None]:
# Share of postings whose combined text contains the literal token '#URL'
# (presumably an anonymisation placeholder in the dataset - confirm).
fake_text.str.contains('#URL', regex=False).value_counts(normalize=True)

In [None]:
real_text.str.contains('#URL', regex=False).value_counts(normalize=True)

In [None]:
# Same comparison for the literal token '#EMAIL'.
fake_text.str.contains('#EMAIL', regex=False).value_counts(normalize=True)

In [None]:
real_text.str.contains('#EMAIL', regex=False).value_counts(normalize=True)

In [None]:
# Same comparison for the literal token '#PHONE'.
fake_text.str.contains('#PHONE', regex=False).value_counts(normalize=True)

In [None]:
real_text.str.contains('#PHONE', regex=False).value_counts(normalize=True)

In [None]:
# Combined text of one fraudulent posting, looked up by job_id label.
fake_text[9835]

In [None]:
!pip install word2number

In [None]:
!pip install contractions

In [None]:
from bs4 import BeautifulSoup
import spacy
import unidecode
from word2number import w2n
import contractions

# Load spaCy's small English pipeline; it is used by text_preprocessing below.
nlp = spacy.load("en_core_web_sm")

# Un-flag the negations 'no' and 'not' as stop words so that stop-word
# filtering based on token.is_stop keeps them.
deselect_stop_words = ['no', 'not']
for w in deselect_stop_words:
    nlp.vocab[w].is_stop = False


def strip_html_tags(text):
    """Parse ``text`` as HTML and return only its text content.

    The extracted text fragments are joined with a single space.
    """
    return BeautifulSoup(text, "html.parser").get_text(separator=" ")


def remove_whitespace(text):
    """Collapse every run of whitespace into one space and trim both ends."""
    # str.split() without arguments already discards leading/trailing whitespace.
    words = text.split()
    return " ".join(words)


def remove_accented_chars(text):
    """Transliterate ``text`` to plain ASCII via unidecode (e.g. 'café' -> 'cafe')."""
    return unidecode.unidecode(text)


def expand_contractions(text):
    """Replace English contractions with their long form (e.g. "don't" -> "do not")."""
    return contractions.fix(text)


def text_preprocessing(text, accented_chars=True, contractions=True, 
                       convert_num=True, extra_whitespace=True, 
                       lemmatization=True, lowercase=True, punctuations=True,
                       remove_html=True, remove_num=True, special_chars=True, 
                       stop_words=True):
    """Clean ``text`` and return it as a list of tokens.

    String-level steps run first, in this order: HTML stripping, whitespace
    normalisation, accent removal, contraction expansion, lower-casing.
    The result is then tokenised with the module-level spaCy pipeline ``nlp``
    and filtered token by token.

    Parameters
    ----------
    text : str
        Raw input text.
    accented_chars : bool
        Transliterate accented characters to ASCII.
    contractions : bool
        Expand contractions. This flag shadows the ``contractions`` module
        inside this function only; ``expand_contractions`` still resolves the
        module from the global scope.
    convert_num : bool
        Convert tokens tagged ``NUM`` with ``w2n.word_to_num``. Only has an
        effect when ``remove_num`` is false, because numbers are dropped first.
    extra_whitespace : bool
        Collapse repeated whitespace.
    lemmatization : bool
        Replace each remaining token by its lemma.
    lowercase : bool
        Lower-case the text before tokenising.
    punctuations : bool
        Drop tokens tagged ``PUNCT``.
    remove_html : bool
        Strip HTML markup.
    remove_num : bool
        Drop tokens tagged ``NUM`` or whose text is numeric.
    special_chars : bool
        Drop tokens tagged ``SYM``.
    stop_words : bool
        Drop stop words, except those tagged ``NUM``.

    Returns
    -------
    list
        Kept tokens in order. Entries are strings, except for numbers
        converted by ``convert_num``, which are the numeric values returned by
        ``w2n.word_to_num``.
    """
    if remove_html:
        text = strip_html_tags(text)
    if extra_whitespace:
        text = remove_whitespace(text)
    if accented_chars:
        text = remove_accented_chars(text)
    if contractions:
        text = expand_contractions(text)
    if lowercase:
        text = text.lower()

    doc = nlp(text)

    clean_text = []

    for token in doc:
        is_number = token.pos_ == 'NUM'

        # Filtering: the first matching rule discards the token.
        if stop_words and token.is_stop and not is_number:
            continue
        if punctuations and token.pos_ == 'PUNCT':
            continue
        if special_chars and token.pos_ == 'SYM':
            continue
        if remove_num and (is_number or token.text.isnumeric()):
            continue

        edit = token.text
        if convert_num and is_number:
            try:
                edit = w2n.word_to_num(token.text)
            except ValueError:
                # word_to_num rejects NUM tokens that are neither plain digits
                # nor number words (e.g. "50,000"); keep the raw text instead
                # of aborting the whole document.
                edit = token.text
        elif lemmatization and token.lemma_ != "-PRON-":
            # "-PRON-" is presumably the pronoun placeholder lemma of older
            # spaCy versions; for it the original token text is kept.
            edit = token.lemma_

        if edit != "":
            clean_text.append(edit)
    return clean_text