In [41]:
import argparse
import pandas as pd 
import numpy as np

from sklearn.linear_model import LogisticRegression

from __future__ import print_function
import sys

def eprint(*args, **kwargs):
    print(*args, file=sys.stderr, **kwargs)


In [2]:
data = pd.read_csv('../data/FN_Training_Set.csv', encoding='windows-1251')

In [3]:
data.head(1)

Unnamed: 0,fake_news_score,click_bait_score,Content Title,Content Url,Content Published Time,Content
0,1,1,Камил Хабиб: ДАЕШ и „Ан Нусра” са нов терорист...,http://a-specto.bg/kamil-habib-daesh-i-nusra-s...,17.5.2017 18:35,"Интервю на Десислава Пътева с Камил Хабиб, дек..."


In [4]:
# %load ../feature_extraction/feature_extract.py
import re
import pandas as pd
import argparse
from numpy import *
import io
 
train_data = data
train_data.fillna(value="", inplace=True)

with io.open('../feature_extraction/bg_stopwords.txt', encoding='utf8') as f:
    stopwords = [x.strip() for x in f.readlines()]
stopwords

def sentence2wordlist(raw, language='bg+en'):
    """language: 'bg+en', 'bg', 'en', 'symbol'"""
    if language == 'bg+en':
        regex = "[^а-яА-Яa-zA-Z]"
    elif language == 'bg':
        regex = "[^а-яА-Я]"
    elif language == 'en':
        regex = "[^a-zA-Z]"
    elif language == 'symbol':
        regex = "[^-!$%^&*()_+|~=`{}\[\]:\";'<>?,.\/]"
    elif language == '!':
        regex = "[^?!]"
    clean = re.sub(regex," ", raw)
    words = clean.split()
    return words

get_number_words = lambda sent: len(sentence2wordlist(sent))

get_number_char = lambda sent: len(sent)

get_number_symbols = lambda sent: len(sentence2wordlist(sent, 'symbol'))

def get_number_stopwrods(sent):
    wordlist = sentence2wordlist(sent)
    return array(list(map(lambda x: x in stopwords, wordlist))).sum()
    
def get_avg_char_per_word(sent):
    wordlist = sentence2wordlist(sent)
    return array(list(map(len, wordlist))).mean()

def get_avg_caps_per_char(sent):
    chars_re = "[^а-яА-Яa-zA-Z]"
    # remove white spaces as well as symbols
    clean = re.sub(chars_re,"", sent)
    caps_re = "[^А-ЯA-Z]"
    caps = re.sub(caps_re, "", clean)
    try:
        return len(caps)/len(clean)
    except:
        return -1 # div by 0 case

func_list = [get_number_words, 
             get_number_char,
             get_number_symbols,
             get_number_stopwrods,
             get_avg_char_per_word,
             get_avg_caps_per_char]


col_name_ph = ["{ph}_number_words", 
             "{ph}_number_char",
             "{ph}_number_symbols",
             "{ph}_number_stopwords",
             "{ph}_avg_char_per_word",
             "{ph}_avg_caps_per_char"]

col_name = lambda s: list(map(lambda x: x.format(ph=s), col_name_ph))

def append_new_columns(df, column, name_func_dict):
    for col, func in name_func_dict.items():
        df[col] = df[column].apply(func)
    return df

train_data_extra_features = append_new_columns(train_data, 
                                               "Content Title", 
                                               dict(zip(col_name('title'),func_list)))

train_data_extra_features = append_new_columns(train_data_extra_features, 
                                               "Content", 
                                               dict(zip(col_name('body'),func_list)))



In [20]:
train_data = train_data_extra_features

In [21]:
train_data['fake_news_score_binary'] =  train_data['fake_news_score'] == 3

In [29]:
train_data.head(1)

Unnamed: 0,fake_news_score,click_bait_score,Content Title,Content Url,Content Published Time,Content,title_number_char,title_avg_char_per_word,title_number_stopwords,title_number_symbols,title_number_words,title_avg_caps_per_char,body_number_char,body_number_symbols,body_avg_caps_per_char,body_avg_char_per_word,body_number_words,body_number_stopwords,fake_news_score_binary
0,1,1,Камил Хабиб: ДАЕШ и „Ан Нусра” са нов терорист...,http://a-specto.bg/kamil-habib-daesh-i-nusra-s...,17.5.2017 18:35,"Интервю на Десислава Пътева с Камил Хабиб, дек...",58,,0.0,1,0,-1,7345,194,0,3.0,3,0.0,False


In [69]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_data, train_data["fake_news_score_binary"], \
                                                    test_size=0.25, random_state=0)
X_train.reset_index(drop=True, inplace=True);
X_test.reset_index(drop=True, inplace=True);
y_train.reset_index(drop=True, inplace=True);
y_test.reset_index(drop=True, inplace=True);

In [88]:
def to_words( text ):
    letters_only = re.sub(u"[^A-Яа-яa-zA-Z]", " ", text) 
    words = letters_only.lower().split()  
    stops = set(stopwords) # loaded above; lookup in set is faster than in list                
    meaningful_words = [w for w in words if not w in stops]   
    return( " ".join( meaningful_words ))   

def hist(train_data_features, vocab):
    dist = np.sum(train_data_features, axis=0)
    for tag, count in zip(vocab, dist):
        print (count, tag)

def create_features(train_data):
    clean_train_words = []
    num_rows = train_data.shape[0] 

    print (train_data.shape)
    content = train_data['Content']
    
    for i in xrange(0, num_rows):
        if( (i+1)%1000 == 0 ):
            print ("%d / %d\n" % ( i+1, num_rows ) )      
        clean_train_words.append(to_words(content[i]))
    
    from sklearn.feature_extraction.text import CountVectorizer

    # Initialize the "CountVectorizer" object, which is scikit-learn's
    # bag of words tool.  
    vectorizer = CountVectorizer(analyzer = "word",   \
                                 tokenizer = None,    \
                                 preprocessor = None, \
                                 stop_words = None,   \
                                 max_features = 5000) 
    train_data_features = vectorizer.fit_transform(clean_train_words)
    train_data_features = train_data_features.toarray()
    print(train_data_features.shape)
    vocab = vectorizer.get_feature_names()
    hist(train_data_features, vocab)
    return train_data_features

In [89]:
X_train_features = create_features(X_train)

(2111, 19)
1000 / 2111

2000 / 2111

(2111, 5000)
39 advertising
75 amp
93 and
187 aтa
142 aтo
36 bank
167 baнгa
55 bd
80 be
68 bezistena
640 bg
49 blitz
77 blog
70 bradva
43 bulgaria
34 by
66 ca
57 caмo
848 ce
130 cfr
52 chemtrails
34 co
31 collins
322 com
31 comments
36 cyтpин
82 cи
38 cин
134 cлeд
35 cъc
40 daily
52 deep
34 deste
43 economist
49 eднa
156 facebook
42 google
119 gt
35 he
66 history
62 html
162 http
38 https
97 ii
46 iii
79 in
51 info
40 is
41 km
65 na
50 net
56 new
98 news
164 of
36 on
56 oбaчe
48 oгaтo
44 oитo
52 oйтo
37 oл
32 oлo
406 oт
38 oятo
60 pa
34 paз
125 petel
39 rates
46 rothschild
50 ru
107 skafeto
59 state
415 the
46 times
58 to
42 tя
63 us
35 world
90 www
41 xx
61 youtube
32 zajenata
33 абе
32 абсолютна
239 абсолютно
47 абсурдно
45 аварията
162 август
54 авиолинии
78 австралия
35 австрия
43 автомобили
200 автор
48 автора
47 автори
56 авторите
64 авторитет
39 авторката
71 авторът
64 агент
105 агенти
36 агентите
35 агенции
123 агенция
96 агресия
39 агресият

In [90]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators = 100) 
forest = forest.fit( X_train_features, X_train["fake_news_score_binary"] )

In [None]:
X_test_features  = create_features(X_test)

In [105]:
from sklearn.cross_validation import cross_val_score
np.mean(cross_val_score(forest, X_test_features, y_test, cv=30))

0.69657971014492759