## 0. Imports

In [1]:
import pickle
import pandas as pd
import numpy as np

## 1. Helper functions

In [2]:
def read_pickle(path):
    """Load every object pickled back-to-back in the file at ``path``.

    Objects are unpickled one after another until the stream is exhausted, so
    a file written with several consecutive ``pickle.dump`` calls yields a
    list with one entry per call, and an empty file yields ``[]``.

    NOTE: unpickling can execute arbitrary code -- only use on trusted files.
    """
    loaded = []
    with open(path, "rb") as stream:
        try:
            while True:
                loaded.append(pickle.load(stream))
        except EOFError:
            # pickle.load signals end-of-stream this way; keep what was read.
            pass
    return loaded

def get_features_targets(path_features,path_targets):
    """Return ``(features, targets)``: the first pickled object of each file.

    Both files are read in full with ``read_pickle``; only the first object
    stored in each one is returned.
    """
    loaded = [read_pickle(location) for location in (path_features, path_targets)]
    return (loaded[0][0], loaded[1][0])

def remove_nan_inf(array, change_value=0):
    """Overwrite NaN and infinite entries of ``array`` with ``change_value``.

    The array is modified IN PLACE and the same object is returned, so the
    caller's original array is changed as well.
    """
    # NaNs are patched first, then infinities, each only if any are present.
    for is_bad in (np.isnan, np.isinf):
        mask = is_bad(array)
        if mask.any():
            array[mask] = change_value
    return array

def check_nan_inf(array):
    """Return a truthy value if ``array`` contains any NaN or infinite entry."""
    has_nan = np.isnan(array).any()
    if has_nan:
        return has_nan
    # No NaN found: the answer is whether any infinity is present.
    return np.isinf(array).any()

def load_clean_dataset(loc_features, loc_targets, name):
    """Load one dataset, replace NaN/inf feature values with 0, and report on it.

    Parameters
    ----------
    loc_features, loc_targets : str
        Paths of the pickled features and the pickled targets.
    name : str
        Label used as a prefix in the printed progress messages.

    Returns
    -------
    tuple
        ``(features, targets['target'])`` -- the cleaned features and the
        ``'target'`` entry of the loaded targets object.
    """
    features_, targets_ = get_features_targets(loc_features, loc_targets)
    print( name+'...')
    # The "Nan" labels report check_nan_inf(), i.e. NaN *or* infinite values.
    print('Nan Before', check_nan_inf(features_))
    features_ = remove_nan_inf(features_)
    print('Nan After', check_nan_inf(features_))
    print(name + 'Shape:', features_.shape)
    return features_, targets_['target']


# Backward-compatible alias: the dataset cells call the loader by its original name.
get_shit = load_clean_dataset

In [29]:
def get_features(arr):
    """Build a DataFrame of the named similarity/sentiment/readability columns.

    Columns are taken from fixed positions of the 2-D array ``arr``:
    41 (tf-idf cosine similarity), 142 (headline/body similarity),
    143-146 (headline sentiment), 147-150 (body sentiment) and
    151-161 (readability features).  The shapes of the extracted slices are
    printed along the way.
    """
    # Tf-idf normalized cosine similarity
    tfidf_cosine = arr[:, 41]
    print('Tf_idf shape:',tfidf_cosine.shape)

    # Column insertion order below defines the column order of the result.
    columns = {'tf_idf_c': tfidf_cosine, 'Head_Body_Sim': arr[:, 142]}

    # Headline / body sentiment blocks, four scores each
    headline_sentiment = arr[:, 143:147]
    body_sentiment = arr[:, 147:151]
    print('Headline sentiments shape:{}\n Body sentiments shape: {}\n'.format(
        headline_sentiment.shape, body_sentiment.shape))
    for prefix, block in (('Head', headline_sentiment), ('Body', body_sentiment)):
        for offset, suffix in enumerate(('compound', 'neg', 'neu', 'pos')):
            columns[prefix + '_' + suffix] = block[:, offset]

    # Readability features
    readability = arr[:, 151:162]
    print('Readibility features shape:',readability.shape)
    readability_names = (
        'flesch_reading_ease', 'smog_index', 'flesch_kincaid_grade',
        'coleman_liau_index', 'automated_readability_index',
        'dale_chall_readability_score', 'difficult_words', 'linsear_write_formula',
        'gunning_fog', 'i_me_myself', 'punct',
    )
    for offset, column_name in enumerate(readability_names):
        columns[column_name] = readability[:, offset]

    return pd.DataFrame(columns)

# For count and other features

def get_count_wordvec(arr):
    """Slice ``arr`` into count features and headline/body word vectors.

    Returns ``(arr[:, :41], arr[:, 42:92], arr[:, 92:142])`` and prints the
    shape of each slice.
    """
    count_block, head_vectors, body_vectors = arr[:, :41], arr[:, 42:92], arr[:, 92:142]
    print('Count features shape', count_block.shape)
    print('Word_vec shape')
    print('Head:\t{} Body:\t{}'.format(head_vectors.shape, body_vectors.shape))
    return count_block, head_vectors, body_vectors

### Mapping

In [6]:
# Work on the first 2232 rows of the scraped dataset only.
features_scrapped,targets_scrapped = get_features_targets(
    'datasets/generated_feature.pkl','datasets/generated_feature_targets.pkl')
X_s = features_scrapped[:2232]
y_s = targets_scrapped['target'][:2232]
print(check_nan_inf(X_s))
# remove_nan_inf() patches the array in place and returns that same array.
# Rebind X_s to the result so the cleaned data no longer depends on that side
# effect; X_k is kept as an alias for any code still using the old name.
X_s = remove_nan_inf(X_s)
X_k = X_s
print(check_nan_inf(X_s))
print(X_s.shape)

False
False
(2232, 163)


In [7]:
# Manual walk through the column layout of X_s; the slices defined here
# (tf_idf_c, head_body_sim, head_senti, body_senti, read_f) are reused by the
# cross-check cell further down.

# Count features
cf = X_s[:,:41]
print('====='*5+'Count Features'+'====='*5)
print('Count features shape:',cf.shape)
print('First element')
print(cf[0])

# Tf-idf normalized cosine similarity
print('====='*5+'Tf-idf similarity'+'====='*5)
tf_idf_c = X_s[:,41]
print('Shape:',tf_idf_c.shape)
print('First element\n',tf_idf_c[:10])

# Word to vec: 50 headline columns, 50 body columns, then one similarity column
print('====='*5+'Word to vec'+'====='*5)
word_vec_head = X_s[:,42:92]
word_vec_body = X_s[:,92:142]
head_body_sim = X_s[:,142]
print('Head', 'Body', 'Similarity', sep = '\t\t')
print(word_vec_head.shape,word_vec_body.shape, head_body_sim.shape,sep = '\t'); print()
print(word_vec_body[1], head_body_sim[:5],sep ='\n')

# Headline / body sentiments (four columns each)
print('====='*5+'Head body sentiments'+'====='*5)

head_senti = X_s[:, 143:147]
body_senti = X_s[:, 147:151]
print('Shape')
print(head_senti.shape, body_senti.shape)
print(head_senti[0], body_senti[0], sep = '\n')

# Readability features
# NOTE(review): this open-ended slice keeps every column from 151 onward,
# whereas get_features() stops at column 162. If X_s has 163 columns (as the
# recorded shape suggests), read_f carries one extra trailing column -- confirm.
print('====='*5+'Readibilty features'+'====='*5)
read_f = X_s[:,151:]
print(read_f.shape)
print(read_f[0])






Count features shape: (2232, 41)
First element
[  7.           7.           1.           6.           6.
   1.           5.           5.           1.         312.
 223.           0.71474359 311.         303.           0.97427653
 310.         310.           1.           5.           0.71428571
   0.           0.           0.           0.           1.
  29.           0.           0.           0.           0.
   0.           0.           0.           0.           0.
   0.           0.           0.           0.           0.
   0.        ]
Shape: (2232,)
First element
 [0.11833917 0.00697112 0.07370369 0.26053153 0.29439177 0.22756415
 0.14593261 0.02041761 0.07684427 0.09184856]
Head		Body		Similarity
(2232, 50)	(2232, 50)	(2232,)

[ 8.20541092e-02  1.57316458e-02  3.32752287e-03 -2.73157156e-02
  6.92358593e-02  1.54418886e-02 -9.51810098e-02 -2.11282552e-02
 -3.34680330e-02 -1.48440022e-02  1.60808232e-02 -1.02009090e-02
 -8.51492036e-02 -3.51991562e-02  1.20794560e-01  6.94927995e-02
 

## Prep

In [10]:
# Cross-check get_features() against the manual slices from the mapping cell:
# each group of printouts below should show the same numbers twice.
tempd= get_features(X_s)
print(tempd.keys())
print('======'*5+'Tf_idf similarity'+'====='*5)
print(tempd['tf_idf_c'][:10], tf_idf_c[:10],sep = '\n')

print('======'*5+'Head Body similarity'+'====='*5)
print(tempd['Head_Body_Sim'][:10], head_body_sim[:10],sep = '\n')

print('======'*5+'Sentiments'+'====='*5)
print(tempd['Head_compound'][:10], tempd['Head_neg'][:10],
      tempd['Head_neu'][:10], tempd['Head_pos'][:10], head_senti[:10],sep = '\n')

print('======'*5+'Article body'+'====='*5)

print(tempd['Body_compound'][:10], tempd['Body_neg'][:10],
      tempd['Body_neu'][:10], tempd['Body_pos'][:10], body_senti[:10],sep = '\n')

print('======'*5+'Reading features'+'====='*5)

print(tempd['flesch_reading_ease'][:10], tempd['smog_index'][:10], tempd['flesch_kincaid_grade'][:10],
      read_f[:,:3][:10],sep = '\n')

print()

# gunning_fog, i_me_myself and punct are readability columns 8-10. Index them
# explicitly: read_f is sliced open-ended (X_s[:,151:]) and can carry extra
# trailing columns, in which case read_f[:,-3:] would show the wrong three.
print(tempd['gunning_fog'][:10], tempd['i_me_myself'][:10], tempd['punct'][:10],
      read_f[:,8:11][:10],sep = '\n')

# get_features() already returns a DataFrame; `t` is kept as the preview name.
t = pd.DataFrame(tempd)
t.head()

Tf_idf shape: (2232,)
Headline sentiments shape:(2232, 4)
 Body sentiments shape: (2232, 4)

Readibility features shape: (2232, 11)
Index(['tf_idf_c', 'Head_Body_Sim', 'Head_compound', 'Head_neg', 'Head_neu',
       'Head_pos', 'Body_compound', 'Body_neg', 'Body_neu', 'Body_pos',
       'flesch_reading_ease', 'smog_index', 'flesch_kincaid_grade',
       'coleman_liau_index', 'automated_readability_index',
       'dale_chall_readability_score', 'difficult_words',
       'linsear_write_formula', 'gunning_fog', 'i_me_myself', 'punct'],
      dtype='object')
0    0.118339
1    0.006971
2    0.073704
3    0.260532
4    0.294392
5    0.227564
6    0.145933
7    0.020418
8    0.076844
9    0.091849
Name: tf_idf_c, dtype: float64
[0.11833917 0.00697112 0.07370369 0.26053153 0.29439177 0.22756415
 0.14593261 0.02041761 0.07684427 0.09184856]
0    0.804282
1    0.971542
2    0.968892
3    0.914280
4    0.930432
5    0.983159
6    0.863771
7    0.895777
8    0.961186
9    0.829358
Name: Head_Body

Unnamed: 0,tf_idf_c,Head_Body_Sim,Head_compound,Head_neg,Head_neu,Head_pos,Body_compound,Body_neg,Body_neu,Body_pos,...,smog_index,flesch_kincaid_grade,coleman_liau_index,automated_readability_index,dale_chall_readability_score,difficult_words,linsear_write_formula,gunning_fog,i_me_myself,punct
0,0.118339,0.804282,-0.8238,0.58,0.304,0.116,-0.119276,0.117241,0.806,0.076724,...,11.1,8.3,13.34,11.6,7.75,107.0,8.285714,16.79,3.0,51.0
1,0.006971,0.971542,-0.4215,0.201,0.714,0.084,0.447,0.047333,0.826,0.126667,...,17.1,19.9,12.32,24.7,8.67,48.0,21.0,27.29,0.0,20.0
2,0.073704,0.968892,0.0,0.0,1.0,0.0,0.045808,0.044583,0.6855,0.269833,...,13.2,14.2,11.5,16.9,8.41,57.0,16.25,22.56,0.0,28.0
3,0.260532,0.91428,0.0,0.0,1.0,0.0,-0.030663,0.097875,0.7485,0.15375,...,13.2,12.3,13.87,15.2,8.62,45.0,15.75,20.97,0.0,17.0
4,0.294392,0.930432,-0.5994,0.438,0.562,0.0,-0.159617,0.121278,0.807167,0.071556,...,11.7,9.6,13.28,11.7,9.09,81.0,8.571429,20.26,1.0,34.0


## DUMP

In [18]:
# Each *_path list is [features pickle, targets pickle] for one dataset.
opensources_path = [
    'datasets/generated_feature.pkl',
    'datasets/generated_feature_targets.pkl',
]
kaggle_path = [
    'datasets/generated_feature_kaggle.pkl',
    'datasets/targets_kaggle.pkl',
]
d3_path = [
    'datasets/generated_feature_new.pkl',
    'datasets/targets_new.pkl',
]

# Integer target label -> label text written to the 'target' column.
tdic = {
    0: 'REAL',
    1: 'FAKE',
}

In [21]:
# Dataset 1 (Opensources): load, clean and build the labelled feature frame.
open_f, open_t = get_shit(*opensources_path, 'Opensources')
print('Class\tCounts',open_t.value_counts(), sep = '\n')

open_df = get_features(open_f)
# Attach the raw integer labels first, then translate them through tdic.
open_df['target'] = open_t
open_df['dataset'] = 'Dataset - 1'
open_df['target'] = open_df['target'].map(lambda label: tdic[label])
open_df.head()

Opensources...
Nan Before False
Nan After False
OpensourcesShape: (11161, 163)
Class	Counts
0    5776
1    5385
Name: target, dtype: int64
Tf_idf shape: (11161,)
Headline sentiments shape:(11161, 4)
 Body sentiments shape: (11161, 4)

Readibility features shape: (11161, 11)


Unnamed: 0,tf_idf_c,Head_Body_Sim,Head_compound,Head_neg,Head_neu,Head_pos,Body_compound,Body_neg,Body_neu,Body_pos,...,coleman_liau_index,automated_readability_index,dale_chall_readability_score,difficult_words,linsear_write_formula,gunning_fog,i_me_myself,punct,target,dataset
0,0.118339,0.804282,-0.8238,0.58,0.304,0.116,-0.119276,0.117241,0.806,0.076724,...,13.34,11.6,7.75,107.0,8.285714,16.79,3.0,51.0,REAL,Dataset - 1
1,0.006971,0.971542,-0.4215,0.201,0.714,0.084,0.447,0.047333,0.826,0.126667,...,12.32,24.7,8.67,48.0,21.0,27.29,0.0,20.0,FAKE,Dataset - 1
2,0.073704,0.968892,0.0,0.0,1.0,0.0,0.045808,0.044583,0.6855,0.269833,...,11.5,16.9,8.41,57.0,16.25,22.56,0.0,28.0,FAKE,Dataset - 1
3,0.260532,0.91428,0.0,0.0,1.0,0.0,-0.030663,0.097875,0.7485,0.15375,...,13.87,15.2,8.62,45.0,15.75,20.97,0.0,17.0,FAKE,Dataset - 1
4,0.294392,0.930432,-0.5994,0.438,0.562,0.0,-0.159617,0.121278,0.807167,0.071556,...,13.28,11.7,9.09,81.0,8.571429,20.26,1.0,34.0,FAKE,Dataset - 1


In [24]:
# Dataset 2 (Kaggle): load, clean and build the labelled feature frame.
kaggle_f, kaggle_t = get_shit(*kaggle_path, 'Kaggle')
print('Class\tCounts', kaggle_t.value_counts(), sep = '\n')

kaggle_df = get_features(kaggle_f)
# Translate integer labels through tdic before attaching them.
kaggle_df['target'] = kaggle_t.map(lambda label: tdic[label])
kaggle_df['dataset'] = 'Dataset - 2'
kaggle_df.head()

Kaggle...
Nan Before True
Nan After False
KaggleShape: (20800, 163)
Class	Counts
1    10413
0    10387
Name: target, dtype: int64
Tf_idf shape: (20800,)
Headline sentiments shape:(20800, 4)
 Body sentiments shape: (20800, 4)

Readibility features shape: (20800, 11)


Unnamed: 0,tf_idf_c,Head_Body_Sim,Head_compound,Head_neg,Head_neu,Head_pos,Body_compound,Body_neg,Body_neu,Body_pos,...,coleman_liau_index,automated_readability_index,dale_chall_readability_score,difficult_words,linsear_write_formula,gunning_fog,i_me_myself,punct,target,dataset
0,0.356702,0.848848,0.0,0.0,1.0,0.0,-0.002041,0.037054,0.920405,0.042568,...,12.25,15.9,7.77,144.0,20.0,19.94,2.0,66.0,FAKE,Dataset - 2
1,0.114182,0.858105,0.0,0.0,1.0,0.0,0.019269,0.083345,0.834759,0.081862,...,11.38,19.1,8.29,130.0,15.25,23.52,7.0,66.0,REAL,Dataset - 2
2,0.072524,0.921988,-0.3182,0.33,0.459,0.211,0.076524,0.083588,0.80551,0.11098,...,12.83,22.3,8.43,227.0,10.833333,24.99,1.0,130.0,FAKE,Dataset - 2
3,0.361706,0.947477,-0.6705,0.333,0.667,0.0,-0.461196,0.191074,0.792741,0.016185,...,11.51,22.7,8.17,85.0,18.666667,25.22,1.0,36.0,FAKE,Dataset - 2
4,0.268379,0.943439,-0.7964,0.372,0.628,0.0,-0.39736,0.1228,0.8518,0.0254,...,13.66,79.2,14.53,32.0,30.0,71.57,0.0,15.0,FAKE,Dataset - 2


In [25]:
# Dataset 3: load, clean and build the labelled feature frame.
d3_f, d3_t = get_shit(*d3_path, 'D -- 3')
print('Class\tCounts', d3_t.value_counts(), sep = '\n')

d3_df = get_features(d3_f)
# Translate integer labels through tdic before attaching them.
d3_df['target'] = d3_t.map(lambda label: tdic[label])
d3_df['dataset'] = 'Dataset - 3'
d3_df.head()

D -- 3...
Nan Before True
Nan After False
D -- 3Shape: (6335, 163)
Class	Counts
0    3171
1    3164
Name: target, dtype: int64
Tf_idf shape: (6335,)
Headline sentiments shape:(6335, 4)
 Body sentiments shape: (6335, 4)

Readibility features shape: (6335, 11)


Unnamed: 0,tf_idf_c,Head_Body_Sim,Head_compound,Head_neg,Head_neu,Head_pos,Body_compound,Body_neg,Body_neu,Body_pos,...,coleman_liau_index,automated_readability_index,dale_chall_readability_score,difficult_words,linsear_write_formula,gunning_fog,i_me_myself,punct,target,dataset
0,0.109181,0.822768,-0.4939,0.444,0.556,0.0,-0.210469,0.162989,0.779333,0.057678,...,10.86,12.7,7.38,213.0,8.833333,17.81,0.0,104.0,FAKE,Dataset - 3
1,0.150184,0.940949,-0.5267,0.256,0.625,0.119,0.036058,0.063,0.8365,0.062038,...,11.78,13.6,7.89,87.0,8.571429,19.2,0.0,43.0,FAKE,Dataset - 3
2,0.129203,0.958502,0.3612,0.0,0.762,0.238,0.031881,0.047563,0.883937,0.0685,...,12.14,23.3,8.71,80.0,10.333333,26.66,2.0,35.0,REAL,Dataset - 3
3,0.049776,0.920681,-0.3595,0.266,0.591,0.143,-0.095876,0.084765,0.867471,0.047706,...,15.73,23.3,9.25,97.0,33.0,26.28,0.0,43.0,FAKE,Dataset - 3
4,0.081292,0.949463,-0.3612,0.243,0.654,0.103,0.220305,0.055952,0.795905,0.14819,...,11.38,19.0,7.67,46.0,10.333333,21.88,1.0,38.0,REAL,Dataset - 3


In [26]:
# Combining Dataframe
# Stack the three per-dataset feature frames row-wise; ignore_index=True
# replaces their individual indexes with one fresh 0..n-1 index.
f_combined = pd.concat([open_df, kaggle_df, d3_df], ignore_index = True)
f_combined.head()

Unnamed: 0,tf_idf_c,Head_Body_Sim,Head_compound,Head_neg,Head_neu,Head_pos,Body_compound,Body_neg,Body_neu,Body_pos,...,coleman_liau_index,automated_readability_index,dale_chall_readability_score,difficult_words,linsear_write_formula,gunning_fog,i_me_myself,punct,target,dataset
0,0.118339,0.804282,-0.8238,0.58,0.304,0.116,-0.119276,0.117241,0.806,0.076724,...,13.34,11.6,7.75,107.0,8.285714,16.79,3.0,51.0,REAL,Dataset - 1
1,0.006971,0.971542,-0.4215,0.201,0.714,0.084,0.447,0.047333,0.826,0.126667,...,12.32,24.7,8.67,48.0,21.0,27.29,0.0,20.0,FAKE,Dataset - 1
2,0.073704,0.968892,0.0,0.0,1.0,0.0,0.045808,0.044583,0.6855,0.269833,...,11.5,16.9,8.41,57.0,16.25,22.56,0.0,28.0,FAKE,Dataset - 1
3,0.260532,0.91428,0.0,0.0,1.0,0.0,-0.030663,0.097875,0.7485,0.15375,...,13.87,15.2,8.62,45.0,15.75,20.97,0.0,17.0,FAKE,Dataset - 1
4,0.294392,0.930432,-0.5994,0.438,0.562,0.0,-0.159617,0.121278,0.807167,0.071556,...,13.28,11.7,9.09,81.0,8.571429,20.26,1.0,34.0,FAKE,Dataset - 1


In [27]:
# Column dtypes and non-null counts of the combined feature frame.
f_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38296 entries, 0 to 38295
Data columns (total 23 columns):
tf_idf_c                        38296 non-null float64
Head_Body_Sim                   38296 non-null float64
Head_compound                   38296 non-null float64
Head_neg                        38296 non-null float64
Head_neu                        38296 non-null float64
Head_pos                        38296 non-null float64
Body_compound                   38296 non-null float64
Body_neg                        38296 non-null float64
Body_neu                        38296 non-null float64
Body_pos                        38296 non-null float64
flesch_reading_ease             38296 non-null float64
smog_index                      38296 non-null float64
flesch_kincaid_grade            38296 non-null float64
coleman_liau_index              38296 non-null float64
automated_readability_index     38296 non-null float64
dale_chall_readability_score    38296 non-null float64
difficult_w

## Count and other features

In [28]:
# Column labels for the 41-column count-feature block, generated from their
# naming pattern instead of being spelled out one by one.
_ngram_sizes = ('unigram', 'bigram', 'trigram')
_exist_words = ('fake', 'fraud', 'hoax', 'false', 'deny', 'denies', 'not', 'despite',
                'nope', 'doubt', 'doubts', 'bogus', 'debunk', 'pranks', 'retract')

cf_columns = (
    # count / unique count / unique ratio per n-gram size: Headline, then articleBody
    [template.format(field, gram)
     for field in ('Headline', 'articleBody')
     for gram in _ngram_sizes
     for template in ('count_of_{}_{}', 'count_of_unique_{}_{}', 'ratio_of_unique_{}_{}')]
    # '..._in_articleBody' count / ratio per n-gram size
    + [template.format(gram)
       for gram in _ngram_sizes
       for template in ('count_of_Headline_{}_in_articleBody',
                        'ratio_of_Headline_{}_in_articleBody')]
    + ['len_sent_Headline', 'len_sent_articleBody']
    # one '<word>_exist' column per word
    + [word + '_exist' for word in _exist_words]
)

In [30]:
# Split each dataset's feature array into its count-feature block and its
# headline / body word-vector blocks.
# Opensources
Open_cf, Open_wv_head, Open_wv_body = get_count_wordvec(open_f)
# Kaggle
kaggle_cf, kaggle_wv_head, kaggle_wv_body = get_count_wordvec(kaggle_f)
# D3
d3_cf, d3_wv_head, d3_wv_body = get_count_wordvec(d3_f)

Count features shape (11161, 41)
Word_vec shape
Head:	(11161, 50) Body:	(11161, 50)
Count features shape (20800, 41)
Word_vec shape
Head:	(20800, 50) Body:	(20800, 50)
Count features shape (6335, 41)
Word_vec shape
Head:	(6335, 50) Body:	(6335, 50)


In [32]:
# Peek at the Opensources count-feature array.
Open_cf

array([[7., 7., 1., ..., 0., 0., 0.],
       [9., 9., 1., ..., 0., 0., 0.],
       [2., 2., 1., ..., 0., 0., 0.],
       ...,
       [8., 8., 1., ..., 0., 0., 0.],
       [6., 6., 1., ..., 0., 0., 0.],
       [9., 9., 1., ..., 0., 0., 0.]])

In [34]:
# Dataframes
# One labelled frame per dataset, using cf_columns as the column names.
op_df = pd.DataFrame(Open_cf,columns=cf_columns)

k_df = pd.DataFrame(kaggle_cf,columns=cf_columns)

# Note: despite its generic name, cf_df holds the D3 count features.
cf_df = pd.DataFrame(d3_cf,columns=cf_columns)



In [40]:
# Row count and preview of the Opensources count-feature frame.
print(len(op_df))
op_df.head()

11161


Unnamed: 0,count_of_Headline_unigram,count_of_unique_Headline_unigram,ratio_of_unique_Headline_unigram,count_of_Headline_bigram,count_of_unique_Headline_bigram,ratio_of_unique_Headline_bigram,count_of_Headline_trigram,count_of_unique_Headline_trigram,ratio_of_unique_Headline_trigram,count_of_articleBody_unigram,...,denies_exist,not_exist,despite_exist,nope_exist,doubt_exist,doubts_exist,bogus_exist,debunk_exist,pranks_exist,retract_exist
0,7.0,7.0,1.0,6.0,6.0,1.0,5.0,5.0,1.0,312.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,9.0,9.0,1.0,8.0,8.0,1.0,7.0,7.0,1.0,158.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,159.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,7.0,7.0,1.0,6.0,6.0,1.0,5.0,5.0,1.0,119.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.0,4.0,1.0,3.0,3.0,1.0,2.0,2.0,1.0,173.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
# Row count and preview of the Kaggle count-feature frame.
print(len(k_df))
k_df.head()

20800


Unnamed: 0,count_of_Headline_unigram,count_of_unique_Headline_unigram,ratio_of_unique_Headline_unigram,count_of_Headline_bigram,count_of_unique_Headline_bigram,ratio_of_unique_Headline_bigram,count_of_Headline_trigram,count_of_unique_Headline_trigram,ratio_of_unique_Headline_trigram,count_of_articleBody_unigram,...,denies_exist,not_exist,despite_exist,nope_exist,doubt_exist,doubts_exist,bogus_exist,debunk_exist,pranks_exist,retract_exist
0,10.0,10.0,1.0,9.0,9.0,1.0,8.0,8.0,1.0,446.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,7.0,7.0,1.0,6.0,6.0,1.0,5.0,5.0,1.0,379.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5.0,5.0,1.0,4.0,4.0,1.0,3.0,3.0,1.0,703.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,7.0,7.0,1.0,6.0,6.0,1.0,5.0,5.0,1.0,320.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10.0,9.0,0.9,9.0,9.0,1.0,8.0,8.0,1.0,89.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
# Row count and preview of the D3 count-feature frame (cf_df is built from d3_cf).
print(len(cf_df))
cf_df.head()

6335


Unnamed: 0,count_of_Headline_unigram,count_of_unique_Headline_unigram,ratio_of_unique_Headline_unigram,count_of_Headline_bigram,count_of_unique_Headline_bigram,ratio_of_unique_Headline_bigram,count_of_Headline_trigram,count_of_unique_Headline_trigram,ratio_of_unique_Headline_trigram,count_of_articleBody_unigram,...,denies_exist,not_exist,despite_exist,nope_exist,doubt_exist,doubts_exist,bogus_exist,debunk_exist,pranks_exist,retract_exist
0,3.0,3.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,681.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,11.0,11.0,1.0,10.0,10.0,1.0,9.0,9.0,1.0,247.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5.0,5.0,1.0,4.0,4.0,1.0,3.0,3.0,1.0,244.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,8.0,8.0,1.0,7.0,7.0,1.0,6.0,6.0,1.0,264.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6.0,6.0,1.0,5.0,5.0,1.0,4.0,4.0,1.0,183.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
# Combined dataframe
cf_combined = pd.concat([op_df, k_df, cf_df], ignore_index = True)
# Adding Target and Dataset columns
# Both combined frames are concatenated in the same dataset order with a fresh
# 0..n-1 index, so the label columns line up row by row. Fail loudly if the
# row counts ever diverge: index alignment would otherwise pad with NaN or
# drop labels without any error.
assert len(cf_combined) == len(f_combined), \
    'count-feature frame and feature frame have different row counts'
cf_combined['target'] = f_combined['target']
cf_combined['dataset'] = f_combined['dataset']
cf_combined.head()

Unnamed: 0,count_of_Headline_unigram,count_of_unique_Headline_unigram,ratio_of_unique_Headline_unigram,count_of_Headline_bigram,count_of_unique_Headline_bigram,ratio_of_unique_Headline_bigram,count_of_Headline_trigram,count_of_unique_Headline_trigram,ratio_of_unique_Headline_trigram,count_of_articleBody_unigram,...,despite_exist,nope_exist,doubt_exist,doubts_exist,bogus_exist,debunk_exist,pranks_exist,retract_exist,target,dataset
0,7.0,7.0,1.0,6.0,6.0,1.0,5.0,5.0,1.0,312.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,REAL,Dataset - 1
1,9.0,9.0,1.0,8.0,8.0,1.0,7.0,7.0,1.0,158.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,FAKE,Dataset - 1
2,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,159.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,FAKE,Dataset - 1
3,7.0,7.0,1.0,6.0,6.0,1.0,5.0,5.0,1.0,119.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,FAKE,Dataset - 1
4,4.0,4.0,1.0,3.0,3.0,1.0,2.0,2.0,1.0,173.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,FAKE,Dataset - 1


In [45]:
# Write both combined frames to CSV in the working directory; index=False
# leaves the row index out of the files.
f_combined.to_csv('features_combined.csv', index = False)
cf_combined.to_csv('count_features_combined.csv', index = False)
print('Done')

Done


In [53]:
# Sanity check: label counts of the 'Dataset - 2' rows in the combined feature frame.
f_combined.loc[f_combined['dataset'] == 'Dataset - 2', 'target'].value_counts()

FAKE    10413
REAL    10387
Name: target, dtype: int64

In [54]:
# Same check on the combined count-feature frame.
cf_combined.loc[cf_combined['dataset'] == 'Dataset - 2', 'target'].value_counts()

FAKE    10413
REAL    10387
Name: target, dtype: int64