# Exercise Subjectivity Mining: lexicon-based approach

### Import packages

In [1]:
import pandas as pd

### Load data

In [7]:
# Define file paths
lexicon_wiegand_path = "baseLexicon.txt" 
lexicon_hurtlex_path = "hurtlex_EN.tsv" 
lexicon_mol_path = "mol.csv" 
test_set_path =  "Subjectivity_mining_assignment_3_4_data/olid-test.csv" 

In [8]:
# Load datasets in frames
df_wiegand = pd.read_csv(lexicon_wiegand_path, delimiter='\t', header=None) 
df_hurtlex = pd.read_csv(lexicon_hurtlex_path, delimiter='\t')
df_mol = pd.read_csv(lexicon_mol_path)
df_test = pd.read_csv(test_set_path)

### Create merged lexicon

In [9]:
# Format Wiegand lexicon
df_wiegand.columns = ['Word', 'Hateful'] 
df_wiegand['Word'] = df_wiegand['Word'].str.split('_').str.get(0)
df_wiegand_hateful = df_wiegand[df_wiegand['Hateful'] == True]

# Drop NaNs for MOL lexicon
df_mol.dropna(subset=['en-american-english'], inplace=True)
df_mol = df_mol[df_mol['en-american-english'] != '0']

In [10]:
# Merge lexicons and drop duplicates
df_merged = pd.concat([df_wiegand_hateful['Word'], df_mol['en-american-english'], df_hurtlex['lemma']], ignore_index=True)
df_merged.drop_duplicates(inplace=True)
df_merged.reset_index(drop=True, inplace=True)

### Create lexicons

In [11]:
wiegand_lexicon = set(df_wiegand_hateful['Word'])
hurtlex_lexicon = set(df_hurtlex['lemma'])
mol_lexicon = set(df_mol['en-american-english'])
merged_lexicon = set(df_merged)

### Run lexicon-based analysis

In [12]:
# Function checks whether word from lexicon is present in string
def check_words(input_str, lexicon):
    return any(word in input_str for word in lexicon)


In [13]:
# Run analysis by checking if words in strings are present in lexicons
df_analysis = df_test.copy()
df_analysis['label wiegand'] = df_analysis['text'].apply(lambda x: check_words(x, wiegand_lexicon))
df_analysis['label hurtlex'] = df_analysis['text'].apply(lambda x: check_words(x, hurtlex_lexicon))
df_analysis['label mol'] = df_analysis['text'].apply(lambda x: check_words(x, mol_lexicon))
df_analysis['label merged'] = df_analysis['text'].apply(lambda x: check_words(x, merged_lexicon))

In [14]:
df_analysis

Unnamed: 0,id,text,labels,label wiegand,label hurtlex,label mol,label merged
0,15923,#WhoIsQ #WheresTheServer #DumpNike #DECLASFISA...,1,True,True,True,True
1,27014,"#ConstitutionDay is revered by Conservatives, ...",0,True,True,True,True
2,30530,#FOXNews #NRA #MAGA #POTUS #TRUMP #2ndAmendmen...,0,False,True,True,True
3,13876,#Watching #Boomer getting the news that she is...,0,False,False,False,False
4,60133,#NoPasaran: Unity demo to oppose the far-right...,1,False,True,True,True
...,...,...,...,...,...,...,...
855,73439,#DespicableDems lie again about rifles. Dem Di...,1,True,True,True,True
856,25657,#MeetTheSpeakers 🙌 @USER will present in our e...,0,False,True,True,True
857,67018,3 people just unfollowed me for talking about ...,1,True,True,True,True
858,50665,#WednesdayWisdom Antifa calls the right fascis...,0,False,True,True,True
