In [57]:
import pandas as pd

In [58]:
df_data = pd.read_csv('A1_dataset.csv')

In [59]:
df_data

Unnamed: 0,LABEL,DATE_TIME,TEXT
0,0,Fri Jun 05 14:26:50 2009,About to get threaded and scared
1,1,Thu May 14 10:13:55 2009,@awaisnaseer I like Shezan Mangooo too!!! I ha...
2,1,Fri Jun 05 21:02:20 2009,worked on my car after work. showering then go...
3,1,Sun Jun 14 22:25:52 2009,@Marama Actually we start this afternoon! I w...
4,1,Sun May 31 00:42:12 2009,@gfalcone601 Aww Gi.don't worry.we'll vote for...
...,...,...,...
4282,1,Sat Jun 06 22:45:26 2009,@QandQ My performances on my CLEP tests. #qshock
4283,0,Tue Jun 16 10:17:07 2009,"ugh no, rcn had all the true blood episodes on..."
4284,1,Fri May 01 22:00:42 2009,Just returned from the forest! Sarah (my merch...
4285,1,Sun Jun 07 02:09:46 2009,is proud of her dad and his piece of work. ( h...


In [61]:
from customregexes import *

# I. REGULAR EXPRESSION

# A.

## a. Average number of sentences and tokens 
- ! or ? (Even Continous)
- . Followed by one or more spaces and then a capital character

In [62]:
df_data['sentence_count'] = df_data['TEXT'].apply(lambda x: findSentenceCount(x))

In [63]:
df_data.groupby(['LABEL'])['sentence_count'].mean()

LABEL
0    1.800500
1    1.854832
Name: sentence_count, dtype: float64

In [65]:
df_data['token_count'] = df_data['TEXT'].apply(lambda x: findTokenCount(x))

In [66]:
df_data['tokens'] = df_data['TEXT'].apply(lambda x: findTokens(x))

In [67]:
df_data.groupby(['LABEL'])['token_count'].mean()

LABEL
0    14.811500
1    14.139921
Name: token_count, dtype: float64

## b. Total number of words starting with consonants and vowels
- Words starting with either Consonant or Vowel (depending on subpart) followed by either an alphabet, an accented alphabet, an apostrophe or a hyphen.
- This was done to ensure no cases of say spanish text which creep in twitter data from countries like the U.S. are not missed


In [72]:
df_data['words_starting_with_vowel'] = df_data['TEXT'].apply(lambda x: countWordsStartingWithVowel(x))

In [73]:
df_data['words_starting_with_vowel'].sum()

14177

In [74]:
df_data.groupby(['LABEL'])['words_starting_with_vowel'].sum()

LABEL
0    6989
1    7188
Name: words_starting_with_vowel, dtype: int64

In [75]:
df_data['words_starting_with_consonant'] = df_data['TEXT'].apply(lambda x: countWordsStartingWithConsonant(x))

In [76]:
df_data['words_starting_with_consonant'].sum()

37196

In [77]:
df_data.groupby(['LABEL'])['words_starting_with_consonant'].sum()

LABEL
0    17995
1    19201
Name: words_starting_with_consonant, dtype: int64

## c. Lowercase the text and report the number of unique tokens present before and after lower casing.

Unique Tokens Before Splitting

In [19]:
def get_unique_tokens(tokens):
    unique_tokens = set()
    for token_list in tokens:
        for token in token_list:
            unique_tokens.add(token)
    return len(unique_tokens)

Total

In [20]:
tokens = df_data['tokens'].to_list()
df_data['lowercase_text'] = df_data['TEXT'].apply(lambda x: lowercase(x))
df_data['lowercase_tokens'] = df_data['lowercase_text'].apply(lambda x: findTokens(x))
lowercase_tokens = df_data['lowercase_tokens'].to_list()

In [78]:
print("Unique Tokens", get_unique_tokens(tokens))
print("Unique Lowercase Tokens", get_unique_tokens(lowercase_tokens))

Unique Tokens 13167
Unique Lowercase Tokens 11565


Label 0

In [22]:
tokens_LABEL_0 = df_data[df_data['LABEL'] == 0]['tokens'].to_list()
lowercase_tokens_LABEL_0 = df_data[df_data['LABEL'] == 0]['lowercase_tokens'].to_list()

In [79]:
print("Unique Tokens", get_unique_tokens(tokens_LABEL_0))
print("Unique Lowercase Tokens", get_unique_tokens(lowercase_tokens_LABEL_0))

Unique Tokens 6991
Unique Lowercase Tokens 6215


Label 1

In [24]:
tokens_LABEL_1 = df_data[df_data['LABEL'] == 1]['tokens'].to_list()
lowercase_tokens_LABEL_1 = df_data[df_data['LABEL'] == 1]['lowercase_tokens'].to_list()

In [80]:
print("Unique Tokens", get_unique_tokens(tokens_LABEL_1))
print("Unique Lowercase Tokens", get_unique_tokens(lowercase_tokens_LABEL_1))

Unique Tokens 8622
Unique Lowercase Tokens 7615


## d. Count and list all the usernames.

Basic Rules and Assumptions:

- According to [Twitter Guidelines](https://help.twitter.com/en/managing-your-account/twitter-username-rules)
  - Your username cannot be longer than 15 characters. Your name can be longer (50 characters) or shorter than 4 characters, but usernames are kept shorter for the sake of ease.
  - A username can only contain alphanumeric characters (letters A-Z, numbers 0-9) with the exception of underscores, as noted above. Check to make sure your desired username doesn't contain any symbols, dashes, or spaces.
  - *Optional Rule to Spot Users* - Usernames containing the words Twitter or Admin cannot be claimed. No account names can contain Twitter or Admin unless they are official Twitter accounts.

- Some experimentation with our Twitter Handle helped us reach the following conclusions:
  - @UserName can't be placed with other alphanumeric characters so abc@user 9@user are not valid and won't tag the user
  - X@UserName where X is a punctuation is valid but the following cases also don't allow tagging - 
    - @@xyz
    - _@xyz

Hence we propose regexes which actually find real tag matches instead of say a user just writing an @ somewhere in the tweet and it getting falsely matched as a tagged user when it really isn't

In [81]:
df_data['UserNames'] = df_data['TEXT'].apply(lambda x: findUsernames(x))
df_data['UserNamesCounts'] = df_data['TEXT'].apply(lambda x: findUsernameCount(x))

In [82]:
df_data['UserNamesCounts'].sum()

2108

In [88]:
print("Username Count Label 0 - " , df_data[df_data['LABEL'] == 0]['UserNamesCounts'].sum())

Username Count Label 0 -  803


In [89]:
print("Username Count Label 1 - " , df_data[df_data['LABEL'] == 1]['UserNamesCounts'].sum())

Username Count Label 1 -  1305


In [90]:
usernames = df_data['UserNames'].to_numpy()
flattened_usernames = []
for i in usernames:
  flattened_usernames.extend(i)

In [91]:
len(set(flattened_usernames))

2021

## e. Count and list all the urls

Basic Rules and Assumptions

- `http`, `https`, `www` common starters for URLs usually
- Manually investigating all sentences which contain `http` shows no false positives that is all occurences of `http` correspond to links
- Manually investigating all sentences which contain `www` shows lots of false positives due to words like `aww`
- On manual inspection we find no sentences which contain `https`
- According to [Twitter's official Blog](https://help.twitter.com/en/using-twitter/url-shortener), Twitter uses a URL-Shortener which converts links to the form `t.co` however there are no positive matches for this in our dataset. The only matches that arise are spurious matches in links like `blogspot.com`
- A URL maybe as simple as `www.xyz.abc` and as complex as `http://www.xyz.abc/efg` and can get even more complex by adding / to index more indepth into pages
- There are false positives that we encounter such as `Gi.don` or `worry.we` but they are both valid URLs as well as there can be custom domains by those names. We assume the domain will be atleast 2 characters long to account for `.me`, `.uk` etc.
- Numbers have been allowed as [only numbers can form valid URLs](https://stackoverflow.com/q/56804936/13858953)

In [92]:
df_data['URLs'] = df_data['TEXT'].apply(lambda x: findURLs(x))
df_data['URLCounts'] = df_data['TEXT'].apply(lambda x: findURLCount(x))

In [93]:
print("URL Count Label 0 - " , df_data[df_data['LABEL'] == 0]['URLCounts'].sum())

URL Count Label 0 -  60


In [94]:
print("URL Count Label 1 - " , df_data[df_data['LABEL'] == 1]['URLCounts'].sum())

URL Count Label 1 -  145


## f. Count the number of tweets for each day of the week. Eg Mon: 58, Tues: 20, Wed...

In [95]:
df_data['Day'] = df_data['DATE_TIME'].apply(lambda x: getDay(x))

In [99]:
print("Day Counts Label 0")
df_data[df_data['LABEL'] == 0]['Day'].value_counts()

Day Counts Label 0


Sun    565
Fri    473
Mon    391
Thu    171
Tue    154
Wed    127
Sat    119
Name: Day, dtype: int64

In [100]:
print("Day Counts Label 1")
df_data[df_data['LABEL'] == 1]['Day'].value_counts()

Day Counts Label 1


Sun    763
Mon    481
Fri    391
Sat    298
Wed    172
Tue    132
Thu     50
Name: Day, dtype: int64

# B.

## a. Total number of occurrences of the given word and sentences containing that word.

In [101]:
def re_find_word_in_sentence(word, sentence):
  return len(re.findall(f'\\b{word}\\b', sentence))

In [102]:
def find_word_counts(df_data, word, class_label):
  filtered_df = df_data[df_data['LABEL'] == class_label]
  filtered_df['Counts'] = filtered_df['TEXT'].apply(lambda x: re_find_word_in_sentence(word, x))
  total_occurence = sum(filtered_df['Counts'])
  sentences_containing_word = filtered_df[filtered_df['Counts'] > 0]['TEXT'].to_list()
  return total_occurence, sentences_containing_word

In [103]:
total_occurence, sentences_containing_word = find_word_counts(df_data, 'i', 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['Counts'] = filtered_df['TEXT'].apply(lambda x: re_find_word_in_sentence(word, x))


In [104]:
total_occurence

450

In [105]:
sentences_containing_word[:5]

["@buckhollywood I Cant Watch That i'm in the UK  Can you tell me what its about? Please x",
 '@mykiaisosm omj ur bad and mean i should not have meet u in 2nd grade even thouggh we hated each other i should have stayed like that ',
 ' i missed the game',
 "dr office... hopefully finding out why i've been so sick  it's so hard to keep my eyes open",
 'i feel like death...my next investment?going to the spa! i need a new body that can function  (via @IngaDurgin)i herd sleep is good 4 dat']

## b. Number of sentences starting with the given word.

In [106]:
def re_find_sentence_starting_with_word(word, sentence):
  if re.findall(f'^\s*{word}\\b', sentence):
    return 1
  return 0

In [107]:
re_find_sentence_starting_with_word("i", " am i "), re_find_sentence_starting_with_word("i", " i am i ")

(0, 1)

In [108]:
def find_all_sentences_starting_with_word(df_data, word, class_label):
  filtered_df = df_data[df_data['LABEL'] == class_label]
  filtered_df['Counts'] = filtered_df['TEXT'].apply(lambda x: re_find_sentence_starting_with_word(word, x))
  total_occurence = sum(filtered_df['Counts'])
  sentences_starting_with_word = filtered_df[filtered_df['Counts'] == 1]['TEXT'].to_list()
  return total_occurence, sentences_starting_with_word

In [109]:
total_occurence, sentences_starting_with_word = find_all_sentences_starting_with_word(df_data, 'i', 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['Counts'] = filtered_df['TEXT'].apply(lambda x: re_find_sentence_starting_with_word(word, x))


In [110]:
total_occurence, sentences_starting_with_word

(52,
 [' i missed the game',
  'i feel like death...my next investment?going to the spa! i need a new body that can function  (via @IngaDurgin)i herd sleep is good 4 dat',
  "i bet i was mistaking.. nah i'm not surprised ",
  'i swear i just felt a earthquake  lol',
  'i need to go out soon   i dont wanna but this weight aint gonna shift its self is it lol x',
  "i'm eating chocolate covered pretzels which is reminding me of Mallrats and making me not want to eat them anymore. ",
  'i really miss photoshop ',
  'i hate it im not yet done to my homework!! ',
  "i have officially lost all feeling in my legs! playing the sims 3 for seven hours isn't good... ",
  "i'm gonna miss those dayss ",
  'i miss my feather duster ',
  'i wish i was watching The Hills ',
  'i miss @romylovesmcfly, anke and paula  i was going with them to tokio hotel.',
  'i feel really bad i just talked to my parents like they were my slaves, and they were being so nice to me  punish me.',
  "i #blamedrewscancer for

## c. Number of sentences ending with the given word.

In [111]:
def re_find_sentence_ending_with_word(word, sentence):
  if re.findall(f'\\b{word}\s*$', sentence):
    return 1
  return 0

In [112]:
re_find_sentence_ending_with_word("i", " am"), re_find_sentence_ending_with_word("i", " i am i")

(0, 1)

In [113]:
def find_all_sentences_ending_with_word(df_data, word, class_label):
  filtered_df = df_data[df_data['LABEL'] == class_label]
  filtered_df['Counts'] = filtered_df['TEXT'].apply(lambda x: re_find_sentence_ending_with_word(word, x))
  total_occurence = sum(filtered_df['Counts'])
  sentences_ending_with_word = filtered_df[filtered_df['Counts'] == 1]['TEXT'].to_list()
  return total_occurence, sentences_ending_with_word

In [114]:
total_occurence, sentences_ending_with_word = find_all_sentences_ending_with_word(df_data, 'scared', 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['Counts'] = filtered_df['TEXT'].apply(lambda x: re_find_sentence_ending_with_word(word, x))


In [115]:
total_occurence, sentences_ending_with_word

(1, ['About to get threaded and scared '])