In [1]:
# importing some stuff
import os
import pandas as pd
import re
from langdetect import detect
import string
from unicodedata import category
import sys
from nltk.corpus import stopwords
from nltk import PorterStemmer
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier

In [2]:
def det_lang(df):
    '''
    :param df: dataframe
    :return: dataframe with added language column
    '''
    def detect_language(text):
        try:
            lan = detect(text)
        except:
            lan = 'unknown'
        return lan

    df['language'] = df['text'].apply(detect_language)
    return df

In [3]:
def delete_not_english(df):
    '''
    :param df: dataframe
    :return: dataframe with removed rows containing not english texts
    '''
    df = det_lang(df)
    df = df[df['language'] == 'en']
    df = df.rename(columns={df.columns[0]: 'id'})
    df = df.drop(['id', 'language'], axis=1)
    return df

In [4]:
# df = pd.read_csv('original_data.csv')
# # remove not english news
# df = delete_not_english(df)
# df.to_csv('en_data.csv', encoding='utf-8')
df = pd.read_csv('en_data.csv')

In [5]:
def split_data(df):
    x = df[['title', 'text']]
    y = df['Ground Label']

    # splitting data into train and test sets (and then splitting train test into train and test for us)
    x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                        random_state=123,
                                                        test_size=0.3,
                                                        shuffle=True)
    x_train_train, x_train_test, y_train_train, y_train_test = train_test_split(x_train, y_train,
                                                                                random_state=123,
                                                                                test_size=0.3,
                                                                                shuffle=True)
    return x_train_train, x_train_test, x_test, y_train_train, y_train_test, y_test

In [6]:
# split data
x_train_train, x_train_test, x_test, y_train_train, y_train_test, y_test = split_data(df)

In [7]:
def create_directories():
    try:
        os.mkdir('validation_data')
        print('Directory validation_data created')
    except FileExistsError:
        print('Directory validation_data already exists')

    try:
        os.mkdir('train_data')
        print('Directory train_data created')
    except FileExistsError:
        print('Directory train_data already exists')

In [8]:
# create directories
create_directories()

Directory validation_data already exists
Directory train_data already exists


In [9]:
def save_to_files(x_train_train, x_train_test, x_test, y_train_train, y_train_test, y_test):
    x_test.to_csv('validation_data/x_test.csv', encoding='utf-8')
    y_test.to_csv('validation_data/y_test.csv', encoding='utf-8')
    print('validation data saved')
    x_train_train.to_csv('train_data/x_train_train.csv', encoding='utf-8')
    x_train_test.to_csv('train_data/x_train_test.csv', encoding='utf=8')
    y_train_train.to_csv('train_data/y_train_train.csv', encoding='utf-8')
    y_train_test.to_csv('train_data/y_train_test.csv', encoding='utf-8')
    print('test data saved')
    print('success! c:')

In [10]:
def change_y_label(df):
    df.loc[df['Ground Label'] == 'fake', 'Ground Label'] = 0
    df.loc[df['Ground Label'] == 'true', 'Ground Label'] = 1
    df['target'] = df['Ground Label']
    df = df['target'].to_frame()
    return df

In [11]:
def stemming(text):
    '''
    :param text: text to be stemmed
    :return: stemmed text
    '''
    words = word_tokenize(text)
    porter = PorterStemmer()
    stem_words = [porter.stem(word) for word in words]
    return ' '.join(stem_words)

In [12]:
def delete_stopwords(text):
    '''
    :param text: string
    :return: string with deleted stopwords
    '''
    stop_words = set(stopwords.words('english'))
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

In [13]:
def clean_text(text, punctuation_chars):
    '''
    :param text: string
    :param punctuation_chars: string with all punctuaction characters to remove
    :return: processed text
    '''
    text = text.lower()
    # remove punctuation
    text = text.translate(str.maketrans('', '', ''.join(punctuation_chars)))
    # remove digits
    text = text.translate(str.maketrans('', '', string.digits))
    # remove all single characters
    pattern = r'(^| ).( |$)'
    text = re.sub(pattern, ' ', text)
    # remove multiple spaces
    text = re.sub(' +', ' ', text)
    # remove stopwords
    text = delete_stopwords(text)
    # stemming
    text = stemming(text)
    return text

In [14]:
def n_upper_chars(string):
    return sum(map(str.isupper, string))

def upper_ratio(string):
    if string == '':
        return 0
    else:
        return n_upper_chars(string)/len(string)

In [15]:
def clean_df(df):
    '''
    :param df: dataframe to process
    :return: preprocessed dataframe
    '''
    df['title'] = df['title'].fillna('')
    df['upper_ratio'] = df['title'].apply(lambda x: upper_ratio(x))
    
    df['full_text'] = df['title'] + ' ' + df['text']
    df = df[['full_text', 'upper_ratio','target']]

    df = df.dropna(subset=['full_text', 'target'])

    punctuation_chars = [chr(i) for i in range(sys.maxunicode)
                         if category(chr(i)).startswith("P")]

    df['full_text'] = df['full_text'].map(lambda x: clean_text(x, punctuation_chars))
    df = df[df['full_text'].str.len() >= 30]
    return df

In [16]:
# train_train data
y_train_train = y_train_train.to_frame()
y_train_train = change_y_label(y_train_train)
x_train_train = clean_df(x_train_train.assign(target=y_train_train))
y_train_train = x_train_train['target'].to_frame()
upper_ratio_train_train = x_train_train['upper_ratio'].to_frame()
x_train_train = x_train_train['full_text'].to_frame()

In [17]:
# train_test data
y_train_test = y_train_test.to_frame()
y_train_test = change_y_label(y_train_test)
x_train_test = clean_df(x_train_test.assign(target=y_train_test))
y_train_test = x_train_test['target'].to_frame()
upper_ratio_train_test = x_train_test['upper_ratio'].to_frame()
x_train_test = x_train_test['full_text'].to_frame()

In [18]:
# test data - for validation team
y_test = y_test.to_frame()
y_test = change_y_label(y_test)
x_test = clean_df(x_test.assign(target=y_test))
y_test = x_test['target'].to_frame()
upper_ratio_test = x_test['upper_ratio'].to_frame()
x_test = x_test['full_text'].to_frame()

In [19]:
save_to_files(x_train_train, x_train_test, x_test, y_train_train, y_train_test, y_test)
upper_ratio_train_train.to_csv('train_data/upper_ratio_train_train.csv', encoding='utf-8')
upper_ratio_train_test.to_csv('train_data/upper_ratio_train_test.csv', encoding='utf-8')
upper_ratio_test.to_csv('validation_data/upper_ratio_test.csv', encoding='utf-8')

validation data saved
test data saved
success! c:


In [32]:
x_train_train = pd.read_csv('train_data/x_train_train.csv')

In [33]:
vectorizer = TfidfVectorizer(max_df=0.7, min_df=0.01)
tfidf_train_train = vectorizer.fit_transform(x_train_train['full_text'])

In [None]:
tfidf_train_test = vectorizer.transform(x_train_test['full_text'])

In [21]:
tfidf_test = vectorizer.transform(x_test['full_text'])

In [34]:
tfidf_train_train = pd.DataFrame(tfidf_train_train.toarray(), columns=vectorizer.get_feature_names())
tfidf_train_train = tfidf_train_train.assign(upper_ratio=upper_ratio_train_train)
tfidf_train_train['upper_ratio'] = tfidf_train_train['upper_ratio'].fillna(0)
tfidf_train_train

Unnamed: 0,abandon,abc,abedin,abil,abl,abort,abroad,absolut,absurd,abus,...,youll,young,younger,your,youth,youtub,youv,zero,zone,upper_ratio
0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.000000
1,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.000000
2,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.049180
3,0.0,0.0,0.0,0.0,0.026156,0.0,0.0,0.061686,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.079365
4,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.144578
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32983,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.144578
32984,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.04171,0.057971
32985,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.000000
32986,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.045476,0.0,0.0,0.00000,0.000000


In [23]:
tfidf_train_test = pd.DataFrame(tfidf_train_test.toarray(), columns=vectorizer.get_feature_names())
tfidf_train_test = tfidf_train_test.assign(upper_ratio=upper_ratio_train_test)
tfidf_train_test['upper_ratio'] = tfidf_train_test['upper_ratio'].fillna(0)
tfidf_train_test

Unnamed: 0,abandon,abc,abedin,abil,abl,abort,abroad,absolut,absurd,abus,...,youll,young,younger,your,youth,youtub,youv,zero,zone,upper_ratio
0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.227848
1,0.005745,0.0,0.0,0.004857,0.025854,0.0,0.002942,0.004690,0.000000,0.000000,...,0.003368,0.004478,0.0,0.007701,0.0,0.0,0.006461,0.005674,0.000000,0.000000
2,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.068774,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.063511,0.000000
3,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.057383,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14132,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000
14133,0.000000,0.0,0.0,0.041124,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000
14134,0.018199,0.0,0.0,0.000000,0.025202,0.0,0.018639,0.029718,0.042134,0.000000,...,0.021342,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000
14135,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.040738,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.043900,0.029851


In [24]:
tfidf_test = pd.DataFrame(tfidf_test.toarray(), columns=vectorizer.get_feature_names())
tfidf_test = tfidf_test.assign(upper_ratio=upper_ratio_test)
tfidf_test['upper_ratio'] = tfidf_test['upper_ratio'].fillna(0)
tfidf_test

Unnamed: 0,abandon,abc,abedin,abil,abl,abort,abroad,absolut,absurd,abus,...,youll,young,younger,your,youth,youtub,youv,zero,zone,upper_ratio
0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.00000,0.0,0.000000,...,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000
1,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.00000,0.0,0.000000,...,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.426829
2,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.00000,0.0,0.024827,...,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000
3,0.0,0.0,0.0,0.017975,0.014722,0.0,0.0,0.01736,0.0,0.000000,...,0.024933,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.040218,0.000000
4,0.0,0.0,0.0,0.000000,0.034638,0.0,0.0,0.00000,0.0,0.000000,...,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20183,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.00000,0.0,0.000000,...,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.337838
20184,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.00000,0.0,0.046172,...,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000
20185,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.00000,0.0,0.000000,...,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000
20186,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.00000,0.0,0.000000,...,0.000000,0.0,0.0,0.032698,0.0,0.036544,0.0,0.0,0.000000,0.000000


In [36]:
tfidf_train_train.to_csv('train_data/tfidf_train_train.csv', encoding='utf-8')
tfidf_train_test.to_csv('train_data/tfidf_train_test.csv', encoding='utf-8')
tfidf_test.to_csv('validation_data/tfidf_test.csv', encoding='utf-8')