In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

from sklearn.svm import SVC
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
import pickle
import boto3
import io

In [2]:
# export these datasets to s3 as csv files
s3_path = 's3://fake-job-posting-detection/Data'
train = pd.read_csv(f'{s3_path}/train.csv')
test = pd.read_csv(f'{s3_path}/test.csv')
validation = pd.read_csv(f'{s3_path}/validation.csv')

In [3]:
X_train = train['text']
y_train = train['fraudulent']
X_test = test['text']
y_test = test['fraudulent']

In [4]:
X_train

0        Government funding is only available for 16-1...
1       Are you looking for part time work and the cha...
2       About the Company:EventBoard (#URL_d92e5669220...
3       Someone once said “an optimist will tell you t...
4       PowerbyProxi and its customers are leading a r...
                              ...                        
5329    Qubit: Cutting Edge Big Data EngineeringQubit,...
5330    Dice is a highly specialized online marketing ...
5331    Avenue Story is currently looking for a self-m...
5332    Hayes Corp is looking for a Mobile Marketing M...
5333    About #URL_ff63a650f387cb722b8e7880655b3f1e996...
Name: text, Length: 5334, dtype: object

### Text Processing

#### TF-IDF

In [5]:
# Initialize TF-IDF Vectorizer
max_features = 5000
min_df=5
max_df=0.8
ngram_range=(1, 2)
tfidf_vectorizer = TfidfVectorizer(stop_words='english', 
                                   max_features=max_features, 
                                   min_df=min_df, 
                                   max_df=max_df, 
                                   ngram_range=ngram_range)

# convert the training and testing data 
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train.values)
X_test_tfidf = tfidf_vectorizer.transform(X_test.values)



In [8]:
# train the model
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train_tfidf, y_train)

In [9]:
# Predict probabilities for the test set
y_pred = naive_bayes.predict(X_test_tfidf)

# Calculate AUC score
auc_score = roc_auc_score(y_test, y_pred)
print(f'AUC Score: {auc_score}')

AUC Score: 0.6137621932280478
