In [133]:
import pandas as pd
import emoji  # needs to be installed with pip
import regex # needs to be installed with pip
import re
import string
from collections import Counter
from datetime import datetime

In [134]:
# Layout of the `created_at` timestamps in the CSV, e.g. "2020-12-19 18:45:03".
# Spelled out as %H:%M:%S instead of %X, because %X is locale-dependent and only
# means HH:MM:SS under the default C/English locale.
CREATED_AT_FORMAT = '%Y-%m-%d %H:%M:%S'


def parser(x):
    """Parse a single `created_at` string into a `datetime`.

    No longer passed to `read_csv` (its `date_parser` argument is deprecated);
    kept so that any code still calling `parser(...)` continues to work.
    """
    return datetime.strptime(x, CREATED_AT_FORMAT)


# Load the tweets with the first CSV column as index, then convert the timestamps in one
# vectorised call with an explicit, strict format.
df_all_orgas = pd.read_csv("df_all_full_without_rt.csv", index_col=0)
df_all_orgas['created_at'] = pd.to_datetime(df_all_orgas['created_at'], format=CREATED_AT_FORMAT)

In [135]:
# Temporary helper column: every tweet split into its whitespace-separated tokens.
# The word-shape features are computed from this column.
def _split_into_words(tweet):
    """Return the whitespace-separated tokens of one tweet text."""
    return tweet.split()


df_all_orgas["list_of_words"] = df_all_orgas["text"].map(_split_into_words)

In [136]:
# Length features per tweet: number of characters and number of whitespace-separated tokens.
tweet_text = df_all_orgas["text"]
df_all_orgas["characters per tweet"] = tweet_text.map(len)
df_all_orgas["words per tweet"] = tweet_text.map(lambda tweet: len(tweet.split()))

In [137]:
# Identify word shape. Every feature below is a per-tweet count over the
# whitespace-separated tokens stored in the helper column "list_of_words".

def count_capitalized(x):
    """Count tokens longer than one character whose first character is upper case
    and whose remainder satisfies `str.islower()`, e.g. "Biden"."""
    return sum(a[0].isupper() and a[1:].islower() for a in x if len(a) > 1)


def count_camel_case(x):
    """Count mixed-case tokens such as "McDonald" (https://www.dictionary.com/browse/camelcase).

    Simplification: a token longer than one character that starts with an upper-case
    character and is neither counted in nb_upper nor in nb_capitalized.
    """
    # Test the whole token with isupper(), not only its tail: a tail without any cased
    # character ("U2", "A.") is neither lower nor upper, so checking the tail alone
    # would count such tokens here although they already count towards nb_upper.
    return sum(
        a[0].isupper() and not a.isupper() and not a[1:].islower()
        for a in x
        if len(a) > 1
    )


# all upper: str.isupper() needs at least one cased character, so "A" and "U.S." count
# while purely numeric tokens do not
df_all_orgas["nb_upper"] = df_all_orgas["list_of_words"].apply(lambda x: sum(1 for c in x if c.isupper()))
# all lower
df_all_orgas["nb_lower"] = df_all_orgas["list_of_words"].apply(lambda x: sum(1 for c in x if c.islower()))

# capitalized: first char upper, others lower
df_all_orgas["nb_capitalized"] = df_all_orgas["list_of_words"].apply(count_capitalized)

# mixed case: starts upper, but neither all upper nor capitalized
df_all_orgas["mixed_upper_lower_not_capitalized"] = df_all_orgas["list_of_words"].apply(count_camel_case)

# count how many tokens have exactly 1, 2, ..., 20 characters (longer tokens get no column)
for word_len in range(1, 21):
    # bind the current length as a default argument so the function does not depend on
    # the loop variable's later values
    count_len = lambda x, n=word_len: sum(len(a) == n for a in x)
    df_all_orgas[f"nb_len_{word_len}"] = df_all_orgas["list_of_words"].apply(count_len)

# remove the helper column again (rebinding instead of mutating in place)
df_all_orgas = df_all_orgas.drop(columns="list_of_words")


In [138]:
# df_all_orgas = df_all_orgas.reset_index(drop=True)

In [139]:
# Preview the first rows to check the newly added length and word-shape columns.
df_all_orgas.head() 

Unnamed: 0,agency,created_at,text,date,characters per tweet,words per tweet,nb_upper,nb_lower,nb_capitalized,mixed_upper_lower_not_capitalized,...,nb_len_11,nb_len_12,nb_len_13,nb_len_14,nb_len_15,nb_len_16,nb_len_17,nb_len_18,nb_len_19,nb_len_20
1340367556947943425,yahoonews,2020-12-19 18:45:03,"A $20,000 fee to change a wedding date? New bi...",2020-12-19 00:00:00,151,21,1,17,1,0,...,0,0,0,0,0,0,0,0,0,0
1340329823282016256,yahoonews,2020-12-19 16:15:06,U.S. intelligence agencies are increasingly fo...,2020-12-19 00:00:00,154,15,1,10,2,0,...,1,2,0,0,0,0,0,0,0,0
1340307168923414531,yahoonews,2020-12-19 14:45:05,Rebuilding trust in the Justice Department sta...,2020-12-19 00:00:00,157,20,0,12,4,0,...,0,0,0,0,0,0,0,0,0,0
1340141062724194306,yahoonews,2020-12-19 03:45:02,Biden says Lindsey Graham is a 'personal disap...,2020-12-19 00:00:00,147,16,0,11,3,0,...,1,0,0,0,2,0,0,0,0,0
1340065561523458048,yahoonews,2020-12-18 22:45:01,75 people and counting test positive for COVID...,2020-12-18 00:00:00,142,16,1,9,3,0,...,0,0,0,0,0,0,0,0,0,0


In [140]:
# List all column labels; the temporary "list_of_words" column should no longer appear.
df_all_orgas.columns

Index(['agency', 'created_at', 'text', 'date', 'characters per tweet',
       'words per tweet', 'nb_upper', 'nb_lower', 'nb_capitalized',
       'mixed_upper_lower_not_capitalized', 'nb_len_1', 'nb_len_2', 'nb_len_3',
       'nb_len_4', 'nb_len_5', 'nb_len_6', 'nb_len_7', 'nb_len_8', 'nb_len_9',
       'nb_len_10', 'nb_len_11', 'nb_len_12', 'nb_len_13', 'nb_len_14',
       'nb_len_15', 'nb_len_16', 'nb_len_17', 'nb_len_18', 'nb_len_19',
       'nb_len_20'],
      dtype='object')

In [141]:
# Display only: the lower-cased Series is shown but not assigned anywhere,
# so df_all_orgas['text'] keeps its original casing for all later cells.
df_all_orgas['text'].str.lower()

1340367556947943425    a $20,000 fee to change a wedding date? new bi...
1340329823282016256    u.s. intelligence agencies are increasingly fo...
1340307168923414531    rebuilding trust in the justice department sta...
1340141062724194306    biden says lindsey graham is a 'personal disap...
1340065561523458048    75 people and counting test positive for covid...
                                             ...                        
1316355491946213376    your dog has a mid-life crisis too! pet pooche...
1316314715434430464    thai pro-democracy protesters fight with royal...
1316309906010583045    fisherman is filmed holding dolphin calf under...
1316291670217052166    thai pro-democracy protesters fight with royal...
1316036386554884096    12,000lb raf 'tallboy' bomb explodes while bei...
Name: text, Length: 20796, dtype: object

In [142]:
# Characters whose per-tweet frequency becomes a feature column:
# the lower-case ASCII letters, the ten digits, and a fixed set of symbols.
letters = [
    *string.ascii_lowercase,
    *string.digits,
    *"?!@$&#%()-+=/",
]

In [143]:
# For every tweet, count how often each character listed in `letters` occurs in its text.
# Iterating over the text column avoids building a full row Series per tweet (iterrows),
# and a Counter returns 0 for characters it has not seen, so no membership test is needed.
# NOTE(review): counting is case-sensitive, so the columns "a".."z" only count lower-case
# letters. An earlier cell displays the lower-cased text without assigning it; confirm
# whether upper-case letters were meant to be included here.
lst = [
    [char_counts[letter] for letter in letters]
    for char_counts in (Counter(text) for text in df_all_orgas["text"])
]

# One row per tweet (same index as df_all_orgas), one column per character in `letters`.
df_temp = pd.DataFrame(lst, columns=letters, index=df_all_orgas.index)

In [144]:
# Preview the per-character counts (one column per entry of `letters`).
df_temp.head()

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,$,&,#,%,(,),-,+,=,/
1340367556947943425,9,2,6,4,10,3,3,4,4,1,...,1,0,0,0,0,0,0,0,0,6
1340329823282016256,7,0,7,2,14,1,5,3,9,0,...,0,0,0,0,0,0,0,0,0,6
1340307168923414531,4,2,5,4,12,2,2,5,9,0,...,0,0,0,0,0,0,0,0,0,6
1340141062724194306,7,1,4,5,9,1,2,5,9,0,...,0,0,0,0,0,0,1,0,0,6
1340065561523458048,6,0,6,1,6,3,1,6,6,0,...,0,0,0,0,0,0,0,0,0,6


In [145]:
# Attach the per-character counts. DataFrame.join aligns on the index, and df_temp was
# built with df_all_orgas.index, so the counts line up with their tweets.
# NOTE(review): not re-runnable on its own - joining a second time would collide with
# the character columns that already exist in df_all_orgas.
df_all_orgas = df_all_orgas.join(df_temp)

In [146]:
# Preview the table after the character-count columns have been joined in.
df_all_orgas.head()

Unnamed: 0,agency,created_at,text,date,characters per tweet,words per tweet,nb_upper,nb_lower,nb_capitalized,mixed_upper_lower_not_capitalized,...,$,&,#,%,(,),-,+,=,/
1340367556947943425,yahoonews,2020-12-19 18:45:03,"A $20,000 fee to change a wedding date? New bi...",2020-12-19 00:00:00,151,21,1,17,1,0,...,1,0,0,0,0,0,0,0,0,6
1340329823282016256,yahoonews,2020-12-19 16:15:06,U.S. intelligence agencies are increasingly fo...,2020-12-19 00:00:00,154,15,1,10,2,0,...,0,0,0,0,0,0,0,0,0,6
1340307168923414531,yahoonews,2020-12-19 14:45:05,Rebuilding trust in the Justice Department sta...,2020-12-19 00:00:00,157,20,0,12,4,0,...,0,0,0,0,0,0,0,0,0,6
1340141062724194306,yahoonews,2020-12-19 03:45:02,Biden says Lindsey Graham is a 'personal disap...,2020-12-19 00:00:00,147,16,0,11,3,0,...,0,0,0,0,0,0,1,0,0,6
1340065561523458048,yahoonews,2020-12-18 22:45:01,75 people and counting test positive for COVID...,2020-12-18 00:00:00,142,16,1,9,3,0,...,0,0,0,0,0,0,0,0,0,6


In [147]:
# Count, per tweet, the characters outside the 7-bit ASCII range (code point > 127).
# ord() never returns a negative value, so the upper bound is the only check required.
lst = [
    sum(ord(ch) > 127 for ch in text)
    for text in df_all_orgas["text"]
]

# Single-column frame sharing the index of df_all_orgas so it can be joined back.
df_temp = pd.DataFrame({'non_ASCII': lst}, index=df_all_orgas.index)

In [148]:
# Preview the non-ASCII character count per tweet.
df_temp.head()

Unnamed: 0,non_ASCII
1340367556947943425,0
1340329823282016256,0
1340307168923414531,2
1340141062724194306,0
1340065561523458048,1


In [149]:
# Attach the "non_ASCII" column. DataFrame.join aligns on the index, and df_temp was
# built with df_all_orgas.index, so each count lands on its own tweet.
# NOTE(review): not re-runnable on its own - a second join would collide with the
# already existing "non_ASCII" column.
df_all_orgas = df_all_orgas.join(df_temp)

In [150]:
# Preview the final feature table; "non_ASCII" is now the last column.
df_all_orgas.head()

Unnamed: 0,agency,created_at,text,date,characters per tweet,words per tweet,nb_upper,nb_lower,nb_capitalized,mixed_upper_lower_not_capitalized,...,&,#,%,(,),-,+,=,/,non_ASCII
1340367556947943425,yahoonews,2020-12-19 18:45:03,"A $20,000 fee to change a wedding date? New bi...",2020-12-19 00:00:00,151,21,1,17,1,0,...,0,0,0,0,0,0,0,0,6,0
1340329823282016256,yahoonews,2020-12-19 16:15:06,U.S. intelligence agencies are increasingly fo...,2020-12-19 00:00:00,154,15,1,10,2,0,...,0,0,0,0,0,0,0,0,6,0
1340307168923414531,yahoonews,2020-12-19 14:45:05,Rebuilding trust in the Justice Department sta...,2020-12-19 00:00:00,157,20,0,12,4,0,...,0,0,0,0,0,0,0,0,6,2
1340141062724194306,yahoonews,2020-12-19 03:45:02,Biden says Lindsey Graham is a 'personal disap...,2020-12-19 00:00:00,147,16,0,11,3,0,...,0,0,0,0,0,1,0,0,6,0
1340065561523458048,yahoonews,2020-12-18 22:45:01,75 people and counting test positive for COVID...,2020-12-18 00:00:00,142,16,1,9,3,0,...,0,0,0,0,0,0,0,0,6,1


In [151]:
# Persist the feature table (original columns plus all derived style features).
# to_csv writes the index as the first column by default.
df_all_orgas.to_csv('style_data.csv')

In [152]:
# Spot-check the column labelled "0": it holds how often the character "0" occurs in
# each tweet (the label comes from the str(0) entry in `letters`).
df_all_orgas["0"].head()

1340367556947943425    5
1340329823282016256    2
1340307168923414531    1
1340141062724194306    0
1340065561523458048    0
Name: 0, dtype: int64