<a href="https://colab.research.google.com/github/asliakalin/ML/blob/master/text_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import sys, argparse
from scipy import sparse
from sklearn import linear_model
from collections import Counter
import numpy as np
import re
from collections import Counter
from collections import defaultdict

In [0]:
#This code gets the train/dev/test files from github and imports them into Colab
!wget https://raw.githubusercontent.com/dbamman/nlp20/master/HW_1/train.txt
!wget https://raw.githubusercontent.com/dbamman/nlp20/master/HW_1/dev.txt
!wget https://raw.githubusercontent.com/dbamman/nlp20/master/HW_1/test.txt.zip
!unzip test.txt.zip

--2020-02-11 09:14:40--  https://raw.githubusercontent.com/dbamman/nlp20/master/HW_1/train.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1427184 (1.4M) [text/plain]
Saving to: ‘train.txt’


2020-02-11 09:14:40 (10.5 MB/s) - ‘train.txt’ saved [1427184/1427184]

--2020-02-11 09:14:41--  https://raw.githubusercontent.com/dbamman/nlp20/master/HW_1/dev.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1474560 (1.4M) [text/plain]
Saving to: ‘dev.txt’


2020-02-11 09:14:42 (22.9 MB/s) - ‘dev.txt’ saved [1474560/1474560]

--2020-02-11 09:14

In [0]:
######################################################################
## Do not edit this block of code.
## This defines the dumb features the model starts with.
######################################################################
def dumb_featurize(text):
	feats = {}
	words = text.split(" ")
	for word in words:
		if word == "love" or word == "like" or word == "best":
			feats["contains_positive_word"] = 1
		if word == "hate" or word == "dislike" or word == "worst" or word == "awful":
			feats["contains_negative_word"] = 1
	return feats

In [0]:
######################################################################
## Do not edit this block of code.
## This defines the sentiment classification class which
## loads the data and sets up the model.
######################################################################

class SentimentClassifier:

	def __init__(self, feature_method):
		self.feature_vocab = {}
		self.feature_method = feature_method


	# Read data from file
	def load_data(self, filename):
		data = []
		with open(filename, encoding="utf8") as file:
			for line in file:
				cols = line.split("\t")
				label = cols[0]
				text = cols[1].rstrip()

				data.append((label, text))
		return data

	# Featurize entire dataset
	def featurize(self, data):
		featurized_data = []
		for label, text in data:
			feats = self.feature_method(text)
			featurized_data.append((label, feats))
		return featurized_data

	# Read dataset and returned featurized representation as sparse matrix + label array
	def process(self, dataFile, training = False):
		data = self.load_data(dataFile)
		data = self.featurize(data)

		if training:			
			fid = 0
			feature_doc_count = Counter()
			for label, feats in data:
				for feat in feats:
					feature_doc_count[feat]+= 1

			for feat in feature_doc_count:
				if feature_doc_count[feat] >= MIN_FEATURE_COUNT[self.feature_method.__name__]:
					self.feature_vocab[feat] = fid
					fid += 1

		F = len(self.feature_vocab)
		D = len(data)
		X = sparse.dok_matrix((D, F))
		Y = np.zeros(D)
		for idx, (label, feats) in enumerate(data):
			for feat in feats:
				if feat in self.feature_vocab:
					X[idx, self.feature_vocab[feat]] = feats[feat]
			Y[idx] = 1 if label == "pos" else 0

		return X, Y

	def load_test(self, dataFile):
		data = self.load_data(dataFile)
		data = self.featurize(data)

		F = len(self.feature_vocab)
		D = len(data)
		X = sparse.dok_matrix((D, F))
		Y = np.zeros(D, dtype = int)
		for idx, (data_id, feats) in enumerate(data):
			# print (data_id)
			for feat in feats:
				if feat in self.feature_vocab:
					X[idx, self.feature_vocab[feat]] = feats[feat]
			Y[idx] = data_id

		return X, Y

	# Train model and evaluate on held-out data
	def evaluate(self, trainX, trainY, devX, devY):
		(D,F) = trainX.shape
		self.log_reg = linear_model.LogisticRegression(C = L2_REGULARIZATION_STRENGTH[self.feature_method.__name__])	
		self.log_reg.fit(trainX, trainY)
		training_accuracy = self.log_reg.score(trainX, trainY)
		development_accuracy = self.log_reg.score(devX, devY)
		print("Method: %s, Features: %s, Train accuracy: %.3f, Dev accuracy: %.3f" % (self.feature_method.__name__, F, training_accuracy, development_accuracy))
		

	# Predict labels for new data
	def predict(self, testX, idsX):
		predX = self.log_reg.predict(testX)

		out = open("%s_%s" % (self.feature_method.__name__, "predictions.csv"), "w", encoding="utf8")
		out.write("Id,Expected\n")
		for idx, data_id in enumerate(testX):
			out.write("%s,%s\n" % (idsX[idx], int(predX[idx])))
		out.close()

	# Write learned parameters to file
	def printWeights(self):
		out = open("%s_%s" % (self.feature_method.__name__, "weights.txt"), "w", encoding="utf8")
		reverseVocab = [None]*len(self.feature_vocab)
		for feat in self.feature_vocab:
			reverseVocab[self.feature_vocab[feat]] = feat

		out.write("%.5f\t__BIAS__\n" % self.log_reg.intercept_)
		for (weight, feat) in sorted(zip(self.log_reg.coef_[0], reverseVocab)):
			out.write("%.5f\t%s\n" % (weight, feat))
		out.close()



In [0]:
def bag_of_words(text):
  word_bag = {}
	# Your code goes here
  words = re.split(r"\s+", text)
  for word in words:
    word = word.strip().lower()
    if word in word_bag:
      word_bag[word] += 1
    else:
      word_bag[word] = 1

  #for w, c in word_bag:
  #  if c > 10:
  #    word_bag.pop(w, None)
  return word_bag

In [0]:
dics = []
# Implement your fancy featurization here
def fancy_featurize(text):
  global dics
  features = {}
  features.update(lexicon_analysis(text))
  features.update(punct_analysis(text))
  features.update(rating_analysis(text))
  features.update(bag_of_words_test_data(text))
  dics.append(features)
  return features

In [0]:
#convert all apostrope negations is, are, would etc into separate words:
def clean_text(text):
  text = re.sub(r'(is\s*n\s*\'*t)', 'is not ', text)
  text = re.sub(r'(I\s*\'\s*m)', 'I am ', text)
  text = re.sub(r'you\s*\'\s*re', 'you are ', text)
  text =re.sub(r'he\s*\'\s*s', 'he is ', text)
  text =re.sub(r'she\s*\'\s*s', 'she is ', text)
  text =re.sub(r'we\s*\'\s*re', 'we are ', text)
  text =re.sub(r'it\s*\'\s*s', 'it is ', text)
  text =re.sub(r'are\s*n\s*\'\s*t', 'are not ', text)
  text =re.sub(r'they\s*\'\s*re', 'they are ', text)
  text =re.sub(r'there\s*\'\s*s', 'there is ', text)
  text =re.sub(r'was\s*n\s*\'\s*t', 'was not ', text)
  text =re.sub(r'were\s*n\s*\'\s*t', 'were not ', text)           
  text =re.sub(r'I\s*\'\s*ve', 'I have ', text)
  text =re.sub(r'you\s*\'\s*ve', 'you have ', text)
  text =re.sub(r'we\s*\'\s*ve', 'we have ', text)
  text =re.sub(r'they\s*\'\s*ve', 'they have ', text)
  text =re.sub(r'has\s*n\s*\'\s*t', 'has not ', text)
  text =re.sub(r'have\s*n\s*\'\s*t', 'have not ', text) 
  text =re.sub(r'I\s*\'\s*d', 'I had ', text)
  text =re.sub(r'you\s*\'\s*d', 'you had ', text)
  text =re.sub(r'he\s*\'\s*d', 'he had ', text)
  text =re.sub(r'she\s*\'\s*d', 'she had ', text)
  text =re.sub(r'it\s*\'\s*d', 'it had ', text)
  text =re.sub(r'we\s*\'\s*d', 'we had ', text)
  text =re.sub(r'they\s*\'\s*d', 'they had ', text)
  text =re.sub(r'does\s*n\s*\'\s*t', 'does not ', text) 
  text =re.sub(r'do\s*n\s*\'\s*t', 'do not ', text)
  text =re.sub(r'did\s*n\s*\'\s*t', 'did not ', text)
  text =re.sub(r'I\s*\'\s*ll', 'I will ', text)
  text =re.sub(r'you\s*\'\s*ll', 'you will ', text)
  text =re.sub(r'he\s*\'\s*ll', 'he will ', text)
  text =re.sub(r'she\s*\'\s*ll', 'she will ', text)
  text =re.sub(r'we\s*\'\s*ll', 'we will ', text)
  text =re.sub(r'they\s*\'\s*ll', 'they will ', text)
  text =re.sub(r'there\s*\'\s*ll', 'there will ', text)
  text =re.sub(r'there\s*\'\s*d', 'there had ', text)
  text =re.sub(r'ca\s*n\s*\'\s*t', 'can not ', text)
  text =re.sub(r'can\s*not', 'can not ', text)
  text =re.sub(r'could\s*n\s*\'\s*t', 'could not ', text)
  text =re.sub(r'dare\s*n\s*\'\s*t', 'dare not ', text)
  text =re.sub(r'had\s*n\s*\'\s*t', 'had not ', text)
  text =re.sub(r'might\s*n\s*\'\s*t', 'might not ', text)
  text =re.sub(r'must\s*n\s*\'\s*t', 'must not ', text)
  text =re.sub(r'need\s*n\s*\'\s*t', 'need not ', text)
  text =re.sub(r'ought\s*n\s*\'\s*t', 'ought not ', text)
  text =re.sub(r'sha\s*n\s*\'\s*t', 'shall not ', text)
  text =re.sub(r'should\s*n\s*\'\s*t', 'should not ', text)
  text =re.sub(r'used\s*n\s*\'\s*t', 'used not ', text)
  text =re.sub(r'wo\s*n\s*\'\s*t', 'will not ', text)
  text =re.sub(r'would\s*n\s*\'\s*t', 'would not ', text)
  text= re.sub(r'b\s*\/\s*c', 'because ', text)
  text =re.sub(r'\s+\'\s*s', ' is ', text)
  text =re.sub(r'\s+u\s+', ' you ', text)
  text =re.sub(r'\s+dr\s+', ' doctor ', text)
  text = re.sub(r"(w+o+a*w+|w+o+a+h*)", 'wow ', text)
  text = re.sub(r"sci[\s-]+fi", ' scifi ', text)
  text = re.sub(r"wanna", ' want to ', text)
  text = re.sub(r"\s+'\s*re", ' are ', text)
  text = re.sub(r"\s+'\s*ll", ' will ', text)
  return text
                     

In [0]:
def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

In [0]:
# stop words set up 
stop = "1 2 3 4 5 6 7 8 9 10 11 a re about above after again against all i am an and any are is s as at be because been before being below between both but by can could d"
stop += "did do does doing down during each few for from further had hadn't has have having he he'd he'll he's her here here's hers herself him himself his how how's i i'd"
stop += "i'll i'm i've if in into is it it's its itself m me more most my myself of on once only or other ought our ours ourselves out over own same she she'd she'll she's" 
stop += "some that that's the their theirs them themselves then there there's these they they'd they'll they're they've this those through to under until up us ve we we'd we'll"
stop +=  "we're we've were what what's when when's where where's which while who who's whom why why's with you you'd you'll you're you've your yours yourself yourselves"
ignore = {}
for w in stop.split(" "):
  ignore[w.strip()] = 0
#print(ignore)

In [0]:
from google.colab import drive
drive.mount('/content/drive')
# subjectivity lexicon set up
f = open('/content/drive/My Drive/Colab Notebooks/subj_clues_lib.txt', 'r')
lex = {}
for line in f:
  word = line.split(" ")
  if word[0] == 'type=weaksubj':
    strength = 'w'
  else: 
    strength = 's'
  
  if word[5].strip() == 'priorpolarity=negative':
    pos = 0
    lex[word[2][6:]] = [strength, pos]
  elif word[5].strip() == 'priorpolarity=positive':
    pos = 1
    lex[word[2][6:].strip()] = [strength, pos]


Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# /10 and /5 ratios mentioned in the reviews
def convert(val):
  if val == None:
    return None
  nums = {"zero": 0, "one": 1, "two": 2, "three":3, "four":4, "five":5, "six":6, "seven":7, "eight":8, "nine":9, "ten":10}
  if val in nums:
    return nums[val]
  elif is_number(val):
    return float(val)

def rating_analysis(text):
  rate = {}
  rates = re.findall(r"(\d\.?\d*\/10|\d\.?\d*\/5)", text)
  also = re.findall(r"\w+\s+out\s+of\s+10", text)
  last = re.findall(r"\w+\s+out\s+of\s+ten", text)
  if len(rates)==0:
    return rate
  calc = [float(r.split("/")[0])/float(r.split("/")[1]) for r in rates]
  calc2 = [convert(str(r.split("out of 10")[0].strip()))/10 for r in also if convert(str(r.split("out of 10")[0].strip()))!=None]
  calc3 = [convert(str(r.split("out of ten")[0].strip()))/10 for r in last if convert(str(r.split("out of ten")[0].strip()))!= None]

  tot = sum(calc) + sum(calc2) + sum(calc3)
  size = len(calc) + len(calc2) + len(calc3)

  rate["given_rate_avg"] = (tot/size)*10
  return rate

In [0]:
# Adds the bag of words representation of the text to feats
def bag_of_words_test_data(text):
  word_bag = {}
  text = clean_text(text.lower())
	# Your code goes here
  words = re.split(r"\W+", text)
  for word in words:
    word = word.strip().lower()
    if (word not in ignore) and (not is_number(word)) and (word != ""): 
      if word in word_bag:
        word_bag[word] += 1
      else:
        word_bag[word] = 1

  return word_bag

In [0]:

#emojis :(  ): (:  :) ;).
#punctuation marks number, ?? ?! !! ** @# ... (+) 
#Examples "f#@k!ng" "w@nk" "F * * *in" "..."
#hmm "af" "haha" "hahha" "Ah hah" "Ah - ha"
def punct_analysis(text):
  info = {}
  info["all_punc"] = len(re.findall(r"[\!\"#\$%&\'\(\)\*+,\-\.\/:;<=>?@\[\\\]\^_`{|}~]", text))
  info["bad_words"] = len(re.findall(r"[a-zA-z]+[@#*%\^\*!$&\-]+[a-z]+[\-%\^@#\*!$&a-zA-z]*", text)) # e.g s@!t, f@#k, f*ck f@#k S@#t w@nk f#@k!ng
  info["asts_total"] = len(re.findall(r"\**\s*\*+[\s*]*", text))           # **, *, * * * * usually emphasis or spoiler alerts
          
  info["excl_blocks"] = len(re.findall(r"\!+\s*\!+[\s!1]*", text))   # e.g !!!!!!!!!, ! ! ! !!, !! ! 11 !
  info["excl_total"] = len(re.findall(r"\!",text))                  # TOTAL !s used
  info["excl_sentences"] = len(re.findall(r"\s*\!+[\s!1]*", text))  # single and block uses of !

  info["q_blocks"] = len(re.findall(r"\?+\s*\?+[\s?]*", text))  # e.g ????, ?? ? ?
  info["q_total"] = len(re.findall(r"\?",text))                 # TOTAL ? used
  info["q_sentences"] = len(re.findall(r"\s*\?+[\s?]*", text))  # single and block uses of ?

  info["mixed_total"] =len(re.findall(r"\?+\s*\!+|\!+\s*\?+",text)) # TOTAL ?! combos used

  info["long_dots"] = len(re.findall(r"\.+\s*\.+[\s.]*", text))     # e.g '... ', '.. ', '........... ', '.. . . . . . . .... '
  info["single_dot_sentences"] = len(re.findall(r"(\w\.(?!\s*\.)|\w\s+\.(?!\.))", text)) #only the sentences that end with a single .
  info["dots_total"] = len(re.findall(r"\.", text))

  info["sad_faces"] = len(re.findall(r"(\)+-?:|:-?\(+|://(?!www)|:/(?!/)|/+:|:'\(+|\)+':)", text)) # TOTAL :( or ): or :/ or ://(not www) or /: or //: or :(( or )): or :'( or  )':
  info["happy_kiss_wink_face"] = len(re.findall(r"(\*+:|:\*+|\(+;|;\)+|:-?\)+|\(+-?:|:-?D+|:d+|:'\)+|\(+':)", text)) 
  # TOTAL :* or *: ;) (; (: or :) or :D or :d or :)) or ((: or :') or  (':
  
  info["wows_total"] = len(re.findall(r"(\s+w+o+a*w+|\s+w+o+a*h*)",text)) #TOTAL wow, woa, woah, woooo*w
  info["omgs_total"] = len(re.findall(r"(\s+o+m+g+|\s+o+h+\s+m+y+\s+g+o+d+)",text)) #TOTAL omg, oh my go*d
  info["laughs_total"] = len(re.findall(r"[ah\s]+",text)) #TOTAL number of laughter haha hahha
  info["suprise_thinking"]= len(re.findall(r"(\s+o+h+|\s+h+mm+)",text)) # oh and hmm
  info["wtf_disgust"] = len(re.findall(r"w+t+f+", text)) + len(re.findall(r"wha+t\s+the+\s+f\s*udge", text)) + len(re.findall(r"wha+t\s+the+\s+he...", text)) + \
                        len(re.findall(r"wha+t\s+the+\s+fu?[\s@#*%\^\*!$&]", text)) # wtf, wtf+, what the fuck, what the hell/heck, what the f[..]k 
  
  info["anger_points"] = info["q_blocks"]*2+ info["bad_words"]*3 + info["sad_faces"] + info["wtf_disgust"]*3 + info["long_dots"]*2 + info["mixed_total"]*2
  info["happy_points"] = info["wows_total"]*5 + info["laughs_total"]*5 + info["happy_kiss_wink_face"]*5 + info["excl_blocks"]*10


  return info

In [0]:
# 1. Adds the sentiment lexicon counts of the text to feats
# 2. Count capitalized words
def lexicon_analysis(text):
  lexicons = {}
	# Your code goes here
  words = text.split(" ")
  pos_count = 0
  pos_strong = 0
  pos_uniq = 0
  neg_count = 0
  neg_strong = 0
  neg_uniq = 0
  tot = 0
  utot = 0
  capitalized = 0
  
  for word in words:
    uniq = set()
    tot += 1
    if word == word.upper():
      capitalized += 1
    word = word.strip().lower()
    if word in lex:
      if lex[word][1] == 1: # positive word
        pos_count += 1
        if lex[word][0] == 's':
          pos_strong += 1
        if word not in uniq:
          utot += 1
          pos_uniq += 1
          uniq.add(word)
      else:                 # negative word
        neg_count += 1
        if lex[word][0] == 's':
          neg_strong += 1
        if word not in uniq:
          utot += 1
          neg_uniq += 1
          uniq.add(word)
    if word not in uniq:
          utot += 1
          uniq.add(word)

  lexicons["pos_sentiment_point"] = pos_strong*10 + (pos_count-pos_strong)*5
  lexicons['pos_strong'] = pos_strong
  lexicons['pos_neg_ratio'] = pos_count/neg_count if neg_count != 0 else pos_count
  lexicons['pos_neg_diff'] = pos_count-neg_count
  lexicons['pos_uniq_tot'] = (pos_uniq/utot)*100
  lexicons['pos_count'] = pos_count
  
  lexicons['caps_total'] = capitalized

  lexicons['neg_count'] = neg_count
  lexicons['neg_uniq'] = neg_uniq/neg_count if neg_count != 0 else neg_uniq
  lexicons['neg_strong'] = neg_strong
  lexicons["neg_sentiment_point"] = neg_strong*10 + (neg_count-neg_strong)*5
  lexicons["points_ratio"] = (lexicons["pos_sentiment_point"]/lexicons["neg_sentiment_point"])*10 if lexicons["neg_sentiment_point"]!=0 else lexicons["pos_sentiment_point"]

  return lexicons

In [0]:
# regularization strength to control overfitting (values closer to 0  = stronger regularization)
L2_REGULARIZATION_STRENGTH = {"dumb_featurize": 1, "fancy_featurize": 0.1}

# must observe feature at least this many times in training data to include in model
MIN_FEATURE_COUNT = {"dumb_featurize": 10,  "fancy_featurize":10 }

if __name__ == "__main__":
  trainingFile = "./train.txt"
  evaluationFile = "./dev.txt"
  testFile = "./test.txt"

  for feature_method in [dumb_featurize, fancy_featurize]:
    sentiment_classifier = SentimentClassifier(feature_method)
    trainX, trainY = sentiment_classifier.process(trainingFile, training=True)
    devX, devY = sentiment_classifier.process(evaluationFile, training=False)
    testX, idsX = sentiment_classifier.load_test(testFile)
    sentiment_classifier.evaluate(trainX, trainY, devX, devY)
    sentiment_classifier.printWeights()
    sentiment_classifier.predict(testX, idsX)


Method: dumb_featurize, Features: 2, Train accuracy: 0.604, Dev accuracy: 0.611


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Method: fancy_featurize, Features: 2007, Train accuracy: 0.824, Dev accuracy: 0.767


In [0]:
import pandas as dp