In [12]:
#load dependencies
import networkx as nx 
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## Extract the data

In [3]:
# GLOBAL CONSTANTS
# printf-style template for the CSV shards; %s receives one entry of FILE_INDEX.
FILE_PATH = 'Data/csv/ira_tweets_00%s.csv'
# Shard suffixes '0' through '9', kept as strings so they can be substituted directly.
FILE_INDEX = list(map(str, range(10)))

In [42]:
# Open the first CSV shard lazily: because chunksize is given, read_csv hands
# back a chunk reader (iterated one DataFrame at a time), not a DataFrame.
chunksize = 10000
filename = FILE_PATH % (FILE_INDEX[0],)
df = pd.read_csv(filename, chunksize=chunksize)

In [None]:
# Read the whole file in a single call (no chunksize) and pass the result
# through optimization(); this rebinds df. Relies on `filename` and
# `optimization` having been defined by other cells of this notebook.
df = optimization(pd.read_csv(filename))

In [13]:
#Calculate the memory use
def mem_usage(pandas_obj):
    """Format the deep memory footprint of a pandas object as megabytes.

    For a DataFrame the per-entry byte counts returned by ``memory_usage`` are
    summed; any other object is assumed to be a Series, whose ``memory_usage``
    already yields a single byte count.

    Returns a string such as ``"1.00 MB"``.
    """
    bytes_per_mb = 1024 ** 2
    footprint = pandas_obj.memory_usage(deep=True)
    if isinstance(pandas_obj, pd.DataFrame):
        # One figure per entry for a frame: collapse them into a total.
        footprint = footprint.sum()
    return "{:03.2f} MB".format(footprint / bytes_per_mb)

In [35]:
#Optimize data frame
def optimization(df):
    """Return a copy of ``df`` whose columns use smaller dtypes where possible.

    Three groups of columns are processed. A before/after memory report is
    printed for each group, followed by a report for the whole frame:

    * ``int64`` columns are downcast to the smallest integer type that holds
      their values: unsigned when the column has no negative value, signed
      otherwise.
    * float columns are downcast with ``pd.to_numeric(..., downcast='float')``.
    * ``object`` columns whose ratio of unique values to rows is below 0.3 are
      converted to ``category``; the remaining ones are kept unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimize. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the converted columns swapped in.
    """
    print('INTEGER OPTIMIZATION: ')
    df_int = df.select_dtypes(include=['int64'])
    converted_int = df_int.apply(_downcast_int_column)

    print('(before)', mem_usage(df_int))
    print('(after)', mem_usage(converted_int))

    print('FLOAT OPTIMIZATION:')
    df_float = df.select_dtypes(include=['float'])
    converted_float = df_float.apply(pd.to_numeric, downcast='float')

    print('(before)', mem_usage(df_float))
    print('(after)', mem_usage(converted_float))

    print('OBJECT OPTIMIZATION:')
    df_obj = df.select_dtypes(include=['object'])
    # Start from a copy and only replace the columns worth converting, rather
    # than growing an empty frame column by column.
    converted_obj = df_obj.copy()
    for col in df_obj.columns:
        if _is_low_cardinality(df_obj[col]):
            converted_obj[col] = df_obj[col].astype('category')

    print(mem_usage(df_obj))
    print(mem_usage(converted_obj))

    # Swap the converted columns into a copy so the caller's frame is untouched.
    optimized_df = df.copy()
    for converted in (converted_int, converted_float, converted_obj):
        for col in converted.columns:
            optimized_df[col] = converted[col]

    print('TOTAL OPTIMIZATION:')
    print('(before)', mem_usage(df))
    print('(after)', mem_usage(optimized_df))
    return optimized_df


def _downcast_int_column(column):
    """Downcast one integer column, picking signed or unsigned from its values."""
    # downcast='unsigned' leaves a column with negative values untouched, so
    # such columns are sent down the signed path to still be shrunk.
    has_negative = len(column) > 0 and column.min() < 0
    return pd.to_numeric(column, downcast='integer' if has_negative else 'unsigned')


def _is_low_cardinality(column, threshold=0.3):
    """Tell whether the share of unique values in ``column`` is below ``threshold``."""
    num_total_values = len(column)
    if num_total_values == 0:
        # No rows: the ratio is undefined, so leave the column as it is.
        return False
    num_unique_values = len(column.unique())
    return num_unique_values / num_total_values < threshold

In [54]:
from collections import defaultdict

# Tally how many tweets carry each language tag across every CSV shard.
lang_frq = defaultdict(int)

cs = 10000
for index in FILE_INDEX:
    path = FILE_PATH % index
    # Only the language column is read below, so skip parsing all the others;
    # reading in chunks of cs rows keeps memory use bounded.
    for chunk in pd.read_csv(path, usecols=['tweet_language'], chunksize=cs):
        # Rows without a language come through as NaN and are tallied under that key.
        for c in chunk['tweet_language']:
            lang_frq[c] += 1

print(lang_frq)

defaultdict(<class 'int'>, {'ru': 4853185, 'bg': 54690, 'en': 3261931, 'und': 230713, 'pl': 1921, 'de': 99332, 'is': 279, nan: 296106, 'uk': 82237, 'it': 20376, 'lt': 1453, 'tr': 3927, 'lv': 525, 'vi': 284, 'da': 3483, 'sr': 8045, 'in': 10325, 'es': 12802, 'tl': 6962, 'cs': 1070, 'ht': 6650, 'ar': 37245, 'hr': 297, 'et': 6146, 'fr': 12636, 'sl': 2347, 'cy': 2677, 'no': 1605, 'sv': 1863, 'nl': 3541, 'ro': 3161, 'pt': 2904, 'sk': 3343, 'eu': 297, 'hi': 311, 'ja': 2660, 'ko': 289, 'fi': 1794, 'bs': 615, 'hu': 519, 'fa': 37, 'hy': 23, 'ur': 17, 'id': 466, 'zh': 57, 'el': 35, 'ml': 2, 'th': 40, 'iw': 56, 'ne': 7, 'mr': 2, 'he': 2, 'bn': 3, 'km': 3, 'ta': 4, 'si': 5, 'iu': 1, 'ps': 1, 'ug': 1})


In [53]:
# Show the language counts ordered from most to least frequent. The sort is
# done here, without rebinding lang_frq, so the cell gives the same result on
# every run instead of depending on a separate sorting step having run first.
print(sorted(lang_frq.items(), key=lambda item: item[1], reverse=True))

[('ru', 4853185), ('en', 3261931), (nan, 296106), ('und', 230713), ('de', 99332), ('uk', 82237), ('bg', 54690), ('ar', 37245), ('it', 20376), ('es', 12802), ('fr', 12636), ('in', 10325), ('sr', 8045), ('tl', 6962), ('ht', 6650), ('et', 6146), ('tr', 3927), ('nl', 3541), ('da', 3483), ('sk', 3343), ('ro', 3161), ('pt', 2904), ('cy', 2677), ('ja', 2660), ('sl', 2347), ('pl', 1921), ('sv', 1863), ('fi', 1794), ('no', 1605), ('lt', 1453), ('cs', 1070), ('bs', 615), ('lv', 525), ('hu', 519), ('id', 466), ('hi', 311), ('hr', 297), ('eu', 297), ('ko', 289), ('vi', 284), ('is', 279), ('zh', 57), ('iw', 56), ('th', 40), ('fa', 37), ('el', 35), ('hy', 23), ('ur', 17), ('ne', 7), ('si', 5), ('ta', 4), ('bn', 3), ('km', 3), ('ml', 2), ('mr', 2), ('he', 2), ('iu', 1), ('ps', 1), ('ug', 1)]
