-
Notifications
You must be signed in to change notification settings - Fork 0
/
sentiment.py
90 lines (74 loc) · 2.2 KB
/
sentiment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 18 11:31:52 2016
@author: adebayo
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import seaborn as sns
import conn
from stop_words import get_stop_words
sns.set(color_codes=True)
reload(conn)
#Processing
# Patterns are compiled once at import time. Raw strings keep the regex
# escapes (\. and \s) from being interpreted as string escapes.
_URL_RE = re.compile(r'((www\.[^\s]+)|(https?://[^\s]+))')
_MENTION_RE = re.compile(r'@[^\s]+')
_WHITESPACE_RE = re.compile(r'[\s]+')
_HASHTAG_RE = re.compile(r'#([^\s]+)')


def processing(tweet):
    """Normalise a raw tweet for feature extraction.

    Steps, in order:
      * lowercase the text
      * replace any ``www.*`` or ``http(s)://*`` token with ``url``
      * replace any ``@username`` token with ``at_user``
      * collapse runs of whitespace into a single space
      * replace ``#word`` with ``word``
      * strip leading/trailing single and double quotes

    :param tweet: the tweet text, or None.
    :returns: the normalised text as the interpreter's native ``str``
        type, or None when ``tweet`` is None.
    """
    if tweet is None:
        return None

    # Bring the input to the native ``str`` type so it always matches the
    # type of the patterns above. Calling ``.encode`` unconditionally fails
    # on non-ASCII Python 2 byte strings and yields un-matchable bytes on
    # Python 3.
    if not isinstance(tweet, str):
        if isinstance(tweet, bytes):
            tweet = tweet.decode('utf8')   # Python 3 bytes -> text
        else:
            tweet = tweet.encode('utf8')   # Python 2 unicode -> byte string

    tweet = tweet.lower()
    tweet = _URL_RE.sub('url', tweet)
    tweet = _MENTION_RE.sub('at_user', tweet)
    tweet = _WHITESPACE_RE.sub(' ', tweet)
    tweet = _HASHTAG_RE.sub(r'\1', tweet)
    # Only quote characters are stripped here, not whitespace.
    return tweet.strip('\'"')
# load the tweets and process them
# - filter the tweet words for the feature vector
# Module-level placeholder list. NOTE(review): nothing in the visible code
# reads it -- stopWord() assigns its own local of the same name -- so confirm
# whether another module relies on it before removing.
stopWords = []
#Replace functions
# Any character followed by one or more copies of itself; DOTALL lets "."
# match newlines as well. Compiled once rather than on every call.
_REPEATED_CHAR_RE = re.compile(r"(.)\1{1,}", re.DOTALL)


def replacefn(char):
    """Collapse each run of a repeated character down to two copies.

    A run of exactly two is left unchanged; longer runs are shortened,
    e.g. ``'hungrryyy'`` becomes ``'hungrryy'``.

    :param char: the string to process (a whole word, despite the name).
    :returns: the string with every run of 3+ identical characters cut to 2.
    """
    return _REPEATED_CHAR_RE.sub(r"\1\1", char)
def stopWord():
    """Return the English stop words plus the tweet placeholder tokens.

    Stop words do not indicate any sentiment and can be removed from a
    tweet. The placeholders ``at_user`` and ``url`` are included so they
    are filtered out as well.

    :returns: a new list on every call; callers may mutate it freely.
    """
    # Copy before extending: get_stop_words() may hand back a list it keeps
    # in a shared cache, and appending to that list directly would add two
    # duplicate entries to it on every call.
    words = list(get_stop_words('en'))
    words.extend(['at_user', 'url'])
    return words
# A feature word must start with a letter and contain only letters/digits.
_FEATURE_WORD_RE = re.compile(r'^[a-zA-Z][a-zA-Z0-9]*$')


def featureVector(tweet):
    """Extract the list of feature words from a processed tweet.

    Each whitespace-separated token has repeated characters collapsed
    (via ``replacefn``), surrounding quotes and ``?,.`` punctuation
    stripped, and is lowercased. A token is kept only when it is not a
    stop word AND it starts with a letter and is purely alphanumeric.

    :param tweet: the tweet text, or None.
    :returns: list of feature words in order of appearance; an empty list
        when ``tweet`` is None or contains no usable words.
    """
    if tweet is None:
        return []

    # Build the stop-word lookup once per tweet instead of once per token;
    # a set makes each membership test O(1).
    stop_words = set(stopWord())

    features = []
    # Iterate over words, not over the characters of the tweet.
    for word in tweet.split():
        word = replacefn(word).strip('\'"?,.').lower()
        if word in stop_words or _FEATURE_WORD_RE.search(word) is None:
            continue
        features.append(word)
    return features
# Load the tweets, normalise the 'text' column and print the feature vector
# of every tweet.
# NOTE(review): presumably conn.createDataframe() returns a pandas DataFrame
# with a 'text' column -- confirm against the conn module.
df = conn.createDataframe()
k = df['text'].map(processing)
for x in k:
    # A parenthesised single argument prints the same under the Python 2
    # print statement and the Python 3 print function.
    print(featureVector(x))
#print df['tweet'].map(lambda x: featureVector(x))