# Creating a dataset from the news articles' content in each data source (politifact or gossipcop) and class (real or fake).

In [1]:
import pandas as pd
import json
import os
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import configs

In [2]:
# Settings are read from the configs.py file
dataset = configs.data_source          # 'politifact' or 'gossipcop'
label_or_class = configs.fake_or_real  # 'fake' or 'real'

# Numeric class label: 1 for the fake folder, 0 for the real folder
label = int(label_or_class == 'fake')

# Directory that holds the downloaded dataset and Twitter files
source = '{}/{}/{}/'.format(configs.dic_source, dataset, label_or_class)

In [4]:
# Keep only the news folders that contain the article itself plus its social
# context (a 'tweets' and a 'retweets' entry); paths are relative to `source`.
f_names = [
    os.path.join(entry, 'news content.json')
    for entry in os.listdir(source)
    if all(
        os.path.exists(os.path.join(source, entry, required))
        for required in ('news content.json', 'tweets', 'retweets')
    )
]

In [None]:
# Compare the number of entries in the source folder with the number that
# passed the social-context filter.
n_all_news = len(os.listdir(source))
n_with_context = len(f_names)
print("All news: ", n_all_news)
print("The news with social context:", n_with_context)

In [7]:
# Collect one record per article and build the DataFrame once at the end.
# Calling pd.concat inside the loop copies the whole frame on every iteration
# (quadratic in the number of articles) and concatenating with an empty frame
# forces the Label column to object dtype; building from a list keeps Label
# as an integer column.
records = []

for f_name in f_names:
    # The context manager closes the file even when parsing or a key lookup
    # fails; JSON text is UTF-8, so do not rely on the platform default.
    with open(os.path.join(source, f_name), encoding='utf-8') as f:
        # returns JSON object as a dictionary
        data = json.load(f)

    # Line breaks are flattened so that each title/text is a single line
    records.append({'F_name': f_name,
                    'Title': data['title'].replace('\n', ' ').replace('\r', ''),
                    'Text': data['text'].replace('\n', ' ').replace('\r', ''),
                    'Label': label})

# Passing the column names keeps the expected layout even when no article matched
df = pd.DataFrame(records, columns=['F_name', 'Title', 'Text', 'Label'])

In [None]:
df.shape

In [9]:
df.head()

Unnamed: 0,F_name,Title,Text,Label
0,gossipcop-846866/news content.json,Who is the most exceptional singer in One Dire...,Something went wrong. Wait a moment and try ag...,1
1,gossipcop-873992/news content.json,Ellen DeGeneres Wrote The Sweetest Message To ...,"Ellen DeGeneres is a talk show host, comedienn...",1
2,gossipcop-924229/news content.json,Corey Feldman Shares ‘Smallest Knife Wound’ Af...,Corey Feldman shared a photo of his scar to up...,1
3,gossipcop-913272/news content.json,Nextdivas.com Is For Sale,The domain nextdivas.com is for sale. To purch...,1
4,gossipcop-896283/news content.json,Tiffany Haddish Teaches Barbra Streisand About...,If Barbra Streisand drops a cover of “Bodak Ye...,1


## Adding the length of text and length of the title to the features

In [10]:
# Number of characters in each title
df['Title_len'] = df['Title'].apply(len)

In [11]:
# Number of characters in each article body
df['Text_len'] = df['Text'].apply(len)

In [12]:
df.head()

Unnamed: 0,F_name,Title,Text,Label,Title_len,Text_len
0,gossipcop-846866/news content.json,Who is the most exceptional singer in One Dire...,Something went wrong. Wait a moment and try ag...,1,52,61
1,gossipcop-873992/news content.json,Ellen DeGeneres Wrote The Sweetest Message To ...,"Ellen DeGeneres is a talk show host, comedienn...",1,96,2162
2,gossipcop-924229/news content.json,Corey Feldman Shares ‘Smallest Knife Wound’ Af...,Corey Feldman shared a photo of his scar to up...,1,66,2335
3,gossipcop-913272/news content.json,Nextdivas.com Is For Sale,The domain nextdivas.com is for sale. To purch...,1,25,132
4,gossipcop-896283/news content.json,Tiffany Haddish Teaches Barbra Streisand About...,If Barbra Streisand drops a cover of “Bodak Ye...,1,54,1999


In [13]:
# Title and body joined by a single space; this is the text the sentiment
# features below are computed on.
df['Content'] = df['Title'].add(" ").add(df['Text'])

In [14]:
# Title and Text are now contained in Content, so the separate columns go
df = df.drop(columns=['Title', 'Text'])

In [15]:
df.head()

Unnamed: 0,F_name,Label,Title_len,Text_len,Content
0,gossipcop-846866/news content.json,1,52,61,Who is the most exceptional singer in One Dire...
1,gossipcop-873992/news content.json,1,96,2162,Ellen DeGeneres Wrote The Sweetest Message To ...
2,gossipcop-924229/news content.json,1,66,2335,Corey Feldman Shares ‘Smallest Knife Wound’ Af...
3,gossipcop-913272/news content.json,1,25,132,Nextdivas.com Is For Sale The domain nextdivas...
4,gossipcop-896283/news content.json,1,54,1999,Tiffany Haddish Teaches Barbra Streisand About...


## Adding the numbers of positive and negative words and the sentiment score to the features

In [16]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _positive_words():
    """Read ``positive.txt`` once and return its lower-cased words as a frozenset."""
    # The context manager closes the handle; the result is cached so the file
    # is not re-read for every row the counter is applied to.
    with open("positive.txt", 'r') as file_positives:
        return frozenset(file_positives.read().lower().split())


def pos_counting(text_input):
    """Count the whitespace-separated tokens of *text_input* found in ``positive.txt``.

    Matching is case-insensitive and exact: tokens are not stripped of
    punctuation, so ``"good,"`` does not match the lexicon entry ``"good"``.
    Repeated occurrences are counted each time.

    Args:
        text_input: The text to scan.

    Returns:
        The number of tokens that appear in the positive-word lexicon.

    Raises:
        OSError: If ``positive.txt`` cannot be opened in the working directory.
    """
    positives = _positive_words()
    # Set membership is O(1) per token instead of a scan over the whole lexicon
    return sum(1 for token in text_input.split() if token.lower() in positives)

In [17]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _negative_words():
    """Read ``negative.txt`` once and return its lower-cased words as a frozenset."""
    # The context manager closes the handle; the result is cached so the file
    # is not re-read for every row the counter is applied to.
    with open("negative.txt", 'r') as file_negatives:
        return frozenset(file_negatives.read().lower().split())


def neg_counting(text_input):
    """Count the whitespace-separated tokens of *text_input* found in ``negative.txt``.

    Matching is case-insensitive and exact: tokens are not stripped of
    punctuation, so ``"bad,"`` does not match the lexicon entry ``"bad"``.
    Repeated occurrences are counted each time.

    Args:
        text_input: The text to scan.

    Returns:
        The number of tokens that appear in the negative-word lexicon.

    Raises:
        OSError: If ``negative.txt`` cannot be opened in the working directory.
    """
    negatives = _negative_words()
    # Set membership is O(1) per token instead of a scan over the whole lexicon
    return sum(1 for token in text_input.split() if token.lower() in negatives)

In [18]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _sentiment_analyzer():
    """Create the VADER analyzer on first use and hand back the same object afterwards."""
    return SentimentIntensityAnalyzer()


def sentiment_score(text_input):
    """Return the VADER ``compound`` sentiment score of *text_input*.

    The analyzer is created once and reused, instead of being constructed
    again for every row this function is applied to.

    Args:
        text_input: The text to score.

    Returns:
        The value stored under ``'compound'`` in the dictionary returned by
        ``polarity_scores``.
    """
    sentiment_dict = _sentiment_analyzer().polarity_scores(text_input)
    return sentiment_dict['compound']

In [19]:
# Positive-word count of every article's combined title and text
df['pos_count'] = df['Content'].apply(pos_counting)

In [20]:
# Negative-word count of every article's combined title and text
df['neg_count'] = df['Content'].apply(neg_counting)

In [21]:
# Compound sentiment score of every article's combined title and text
df['sentiment_score'] = df['Content'].apply(sentiment_score)

In [22]:
df.head()

Unnamed: 0,F_name,Label,Title_len,Text_len,Content,pos_count,neg_count,sentiment_score
0,gossipcop-846866/news content.json,1,52,61,Who is the most exceptional singer in One Dire...,1,0,-0.4767
1,gossipcop-873992/news content.json,1,96,2162,Ellen DeGeneres Wrote The Sweetest Message To ...,15,3,0.9984
2,gossipcop-924229/news content.json,1,66,2335,Corey Feldman Shares ‘Smallest Knife Wound’ Af...,6,15,-0.8664
3,gossipcop-913272/news content.json,1,25,132,Nextdivas.com Is For Sale The domain nextdivas...,0,0,0.0
4,gossipcop-896283/news content.json,1,54,1999,Tiffany Haddish Teaches Barbra Streisand About...,13,7,0.9911


In [None]:
df.shape

In [None]:
# Fixing F_name feature: keep only the article folder name (the part before
# the first '/'), e.g. 'gossipcop-846866' instead of
# 'gossipcop-846866/news content.json'.
# The fix is applied to `df`, the frame that is stored below. df_fake and
# df_real are only created later, when the stored files are read back, so
# referencing them here raised a NameError on a top-to-bottom run.
# Re-applying this to an already shortened name leaves it unchanged.
df['F_name'] = df.F_name.apply(lambda x: x.split('/')[0])

## Storing df in a JSON file

In [25]:
# Write the feature table for this data source and class to a JSON file
# (pandas 'table' orientation, without the index)
dataset_name = '{}_{}_new.json'.format(dataset, label_or_class)
df.to_json(path_or_buf=dataset_name, orient='table', index=False, compression='infer')

In [26]:
# Read the file that was just written back in as plain JSON.
# The context manager closes the handle even if json.load raises.
with open(dataset_name) as f:
    t = json.load(f)

In [None]:
len(t['data'])

## Cleaning and balancing the dataset

In [24]:
# Load the stored fake-news table for the current data source
dataset_fake = '{}_fake_new.json'.format(dataset)
df_fake = pd.read_json(path_or_buf=dataset_fake, orient='table', compression='infer')

# Load the stored real-news table for the current data source
dataset_real = '{}_real_new.json'.format(dataset)
df_real = pd.read_json(path_or_buf=dataset_real, orient='table', compression='infer')

In [None]:
df_fake.shape, df_real.shape

In [None]:
df_real[(df_real.Content == ' ')].shape

In [328]:
# Drop real articles whose Content is just the single joining space,
# i.e. both title and text were empty
df_real = df_real[df_real['Content'] != ' ']

In [None]:
df_real.shape

In [330]:
# Drop fake articles whose Content is just the single joining space,
# i.e. both title and text were empty
df_fake = df_fake[df_fake['Content'] != ' ']

In [None]:
df_fake.shape

In [332]:
# Remove real articles whose text is shorter than 70 characters
df_real = df_real[~df_real['Text_len'].lt(70)]

In [None]:
df_real.shape

In [334]:
# Remove fake articles whose text is shorter than 70 characters
df_fake = df_fake[~df_fake['Text_len'].lt(70)]

In [None]:
df_fake.shape

In [341]:
# Write the filtered real-news table to its own JSON file
dataset_name = '{}_real_balanced.json'.format(dataset)
df_real.to_json(path_or_buf=dataset_name, orient='table', index=False, compression='infer')

# Write the filtered fake-news table to its own JSON file
dataset_name = '{}_fake_balanced.json'.format(dataset)
df_fake.to_json(path_or_buf=dataset_name, orient='table', index=False, compression='infer')