In [1]:
import os
import sys
import json
import csv
import time
import codecs

In [2]:
# Return Directory's file names
def FolderFiles(folder, directory):
    os.chdir(directory)
    for folderName, subFolders, fileNames in os.walk(folder):
        fileNames = filter(lambda filename: filename.endswith('.txt'), fileNames)
        return list(fileNames)

In [3]:
# Filtering Twitter API
# date, text, tags, user, status
def FilterTweet(tweet):
    date = tweet['created_at']
    text = tweet['text']
    tags = list(map(lambda tag: tag['text'],
               tweet['entities']['hashtags']))
    user = tweet["user"]["screen_name"]
    status = 'tweet'

    if 'retweeted_status' in tweet:
        retweeted = tweet['retweeted_status']
        text = retweeted['text']
        tags = list(map(lambda tag: tag['text'],
           retweeted['entities']['hashtags']))
        status = 'retweet'

    elif 'quoted_status' in tweet:
        quoted = tweet['quoted_status']
        text = text + '. ' + quoted['text']
        qt_tags = list(map(lambda tag: tag['text'],
           quoted['entities']['hashtags']))
        tags = list(set(tags + qt_tags))
        status = 'quoted'
    
    tweet_list = [
        date,
        text,
        ','.join(map(lambda tag: tag.lower(), tags)),
        user,
        status
    ]
    
    return tweet_list

In [4]:
def SaveListAppendToCSV(path_to_save, file_name, my_list):
    if not os.path.exists(path_to_save):
        os.makedirs(path_to_save)
    os.chdir(path_to_save)
    
    with open(file_name[:-4] + '.csv', 'a') as file:
        wr = csv.writer(file, quoting=csv.QUOTE_ALL)
        wr.writerow(my_list)

In [5]:
RAW_DATA_PATH = "/Users/Bya/Dropbox/Research/datas/EPL/TwitterRawJsonData/"
SAVE_DATA_PATH = "/Users/Bya/Dropbox/Research/datas/EPL/ExtractedCsvData/"

In [6]:
def RawTweetFilterToCSV(week):
    week_folder = 'GW' + str(week)
    week_files = FolderFiles(week_folder, RAW_DATA_PATH)

    os.chdir(RAW_DATA_PATH + week_folder)

    for file_name in week_files:
        start_time = time.time()
        
        path_to_read = RAW_DATA_PATH + week_folder
        tweets = ReadTextLineByLine(file_name, path_to_read)

        for i in range(len(tweets)):
            try:
                tweet_all = json.loads(tweets[i])
                tweet_list = FilterTweet(tweet_all)

                path_to_save = SAVE_DATA_PATH + week_folder
                SaveListAppendToCSV(path_to_save, file_name, tweet_list)

            except ValueError:
                continue
            except KeyError:
                continue

        print("[Converting Done]: %s (%.2f sec)" % (file_name, time.time() - start_time))

In [7]:
def ReadTextLineByLine(file_name, directory):
    os.chdir(directory)
    
    with open(file_name) as file:
        tweets = (line.rstrip() for line in file)

        # removing blank lines
        tweets = list(line for line in tweets if line)
    
    return tweets

In [8]:
RawTweetFilterToCSV(4)

[Converting Done]: 1508292045_NUFCvsARSENAL_json.txt (5.63 sec)
[Converting Done]: 1508292300_6matches_json.txt (9.93 sec)
[Converting Done]: 1508302130_SAINTSvsNCFC_json.txt (2.18 sec)
[Converting Done]: 1508310000_SWANSvsMUFC_json.txt (10.35 sec)


In [10]:
weeks = [4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15]

[RawTweetFilterToCSV(week) for week in weeks]

[Converting Done]: 1508292045_NUFCvsARSENAL_json.txt (5.59 sec)
[Converting Done]: 1508292300_6matches_json.txt (10.50 sec)
[Converting Done]: 1508302130_SAINTSvsNCFC_json.txt (1.93 sec)
[Converting Done]: 1508310000_SWANSvsMUFC_json.txt (10.58 sec)
[Converting Done]: 1509122045_EFCvsCFC_json.txt (9.74 sec)
[Converting Done]: 1509122300_5matches_json.txt (8.16 sec)
[Converting Done]: 1509130130_MUFCvsLFC_json.txt (16.86 sec)
[Converting Done]: 1509132130_SAFCvsCOYS_json.txt (1.73 sec)
[Converting Done]: 15091400_LCFCvsAVFC_json.txt (2.79 sec)
[Converting Done]: 1509150400_WHUFCvsNUFC_json.txt (2.35 sec)
[Converting Done]: 1509192045_CFCvsARSENAL_json.txt (7.38 sec)
[Converting Done]: 1509192300_5matches_json.txt (6.03 sec)
[Converting Done]: 1509200130_MCFCvsWHU_json.txt (4.79 sec)
[Converting Done]: 1509202130_COYSvsCPFC_json.txt (1.01 sec)
[Converting Done]: 1509210000_2matches_json.txt (20.17 sec)
[Converting Done]: 1509262045_COYSvsMCFC_json.txt (6.04 sec)
[Converting Done]: 150926

[None, None, None, None, None, None, None, None, None, None, None]

In [11]:
# Return Directory's file names
def FolderFiles(folder, directory):
    os.chdir(directory)
    for folderName, subFolders, fileNames in os.walk(folder):
        fileNames = filter(lambda filename: filename.endswith('.csv'), fileNames)
        return list(fileNames)

In [24]:
FolderFiles('GW15', '/Users/Bya/Dropbox/Research/datas/EPL/ExtractedCsvData/')

['game1.csv', 'game10.csv', 'game2_7.csv', 'game8.csv', 'game9.csv']