# Twitter Opinion Mining


In [1]:
# import the different modules
from buildDatabase import buildDatabse
from makeHTnetwork import makeHTNetwork
from selectInitialHashtags import selectInitialHashtags
from propagateLabels import propagateLabels
from addStatSigniHT import addStatSigniHT
from selectHashtags import selectHashtags
from updateHTGroups import updateHTGroups
from buildTrainingSet import buildTrainingSet
from crossValOptimize import crossValOptimize
from trainClassifier import trainClassifier
from classifyTweets import classifyTweets
from makeProbaDF import makeProbaDF
from analyzeProbaDF import analyzeProbaDF

ImportError: No module named 'ds'

### Define filenames and directories for current job

In [2]:
# --- Input -------------------------------------------------------------
# Directories that hold the tweet archive (TAJ) files to ingest.
tweet_archive_dirs = ["etrade"]

# --- Database and hashtag network (files that will be created) ---------
# SQLite database.
sqlite_file = "test.sqlite"
# Hashtag co-occurrence graph.
graph_file = "graph_file.graphml"
# DataFrame with the results of the label propagation on the hashtag network.
propag_results_filename = "propag_results.pickle"

# --- Training set ------------------------------------------------------
# Pickle file where the training set features will be saved.
features_pickle_file = "features.pickle"
# Pickle file where the training set labels will be saved.
labels_pickle_file = "labels.pickle"
# Vectorized features file.
features_vect_file = "features.mmap"
# Vectorized labels file.
labels_vect_file = "labels.mmap"
# Mapping between label names and label numbers.
labels_mappers_file = "labels_mappers.pickle"

# --- Classifier --------------------------------------------------------
# JSON file with the best classifier parameters found by cross-validation.
best_params_file = "best_params.json"
# Where the trained classifier will be saved.
classifier_filename = "classifier.pickle"

# --- Results -----------------------------------------------------------
# DataFrame with the classification probability of every tweet in the database.
df_proba_filename = "df_proba.pickle"
# DataFrame with the number of tweets in each camp per day.
df_num_tweets_filename = "df_num_tweets.pickle"
# DataFrame with the number of users in each camp per day.
df_num_users_filename = "df_num_users.pickle"

In [3]:
# Every parameter of the run lives in this single dictionary,
# which is handed to each of the pipeline modules.
job = dict(
    tweet_archive_dirs=tweet_archive_dirs,
    sqlite_db_filename=sqlite_file,
    graph_file=graph_file,
    propag_results_filename=propag_results_filename,
    features_pickle_file=features_pickle_file,
    labels_pickle_file=labels_pickle_file,
    features_vect_file=features_vect_file,
    labels_vect_file=labels_vect_file,
    labels_mappers_file=labels_mappers_file,
    classifier_filename=classifier_filename,
    df_proba_filename=df_proba_filename,
    df_num_tweets_filename=df_num_tweets_filename,
    df_num_users_filename=df_num_users_filename,
    best_params_file=best_params_file,
)

### 1. Build the SQLite database with the extracted info from the tweets
Read the tweets from all the .taj files in the directories `tweet_archive_dirs`
and add them to the database `sqlite_db_filename`.

In [4]:
# Read the .taj tweet archives found in job['tweet_archive_dirs'] and add
# them to the SQLite database job['sqlite_db_filename'].
# NOTE(review): `buildDatabse` (sic) is the name imported in the first cell.
buildDatabse(job).run()

Dropping index sqlite_autoindex_hashtag_1
index associated with UNIQUE or PRIMARY KEY constraint cannot be dropped
Dropping index sqlite_autoindex_hashtag_tweet_user_1
index associated with UNIQUE or PRIMARY KEY constraint cannot be dropped
Dropping index sqlite_autoindex_tweet_to_keyword_1
index associated with UNIQUE or PRIMARY KEY constraint cannot be dropped
Dropping index sqlite_autoindex_tweet_to_mentioned_uid_1
index associated with UNIQUE or PRIMARY KEY constraint cannot be dropped
Dropping index sqlite_autoindex_filename_1
index associated with UNIQUE or PRIMARY KEY constraint cannot be dropped
Dropping index sqlite_autoindex_query_1
index associated with UNIQUE or PRIMARY KEY constraint cannot be dropped
Dropping index sqlite_autoindex_source_content_1
index associated with UNIQUE or PRIMARY KEY constraint cannot be dropped
Dropping index sqlite_autoindex_source_url_1
index associated with UNIQUE or PRIMARY KEY constraint cannot be dropped
Dropping index sqlite_autoindex_twee

### 2.  Make the Hashtag co-occurrences network
Reads all the co-occurrences from the SQLite database and builds the network
in which nodes are hashtags and edges are co-occurrences.
The graph is a [*graph-tool*](https://graph-tool.skewed.de/) object and is saved in graphml format to `graph_file`.

Nodes of the graph have two properties: `counts` is the number of single occurrences of the hashtag and `name` is the name of the hashtag.

Edges have a property `weights` equal to the number of co-occurrences they represent.

The graph has the following properties saved with it:
- `Ntweets`: number of tweets with at least one hashtag used to build the graph.
- `start_date` : date of the first tweet.
- `stop_date` : date of the last tweet.
- `weight_threshold` : co-occurrence threshold. Edges with less than `weight_threshold` co-occurrences are discarded.

*Optional parameters that can be added to `job`:*
- `start_date` and `stop_date` to specify a time range for the tweets. (Default is `None`, i.e. select all the tweets in the database).
- `weight_threshold` is the minimum number of co-occurrences between two hashtags for the edge to be included in the graph. (Default is 3).

To add a parameter to job, simply execute `job["parameter name"] = parameter value`.


In [5]:
# Build the hashtag co-occurrence graph from the SQLite database and save it
# in graphml format to job['graph_file'].
makeHTNetwork(job).run()

creating edge list
*** took 1.912s
creating graph
*** took 0.004421s


### 3. Add statistical significance value to edges
Adds a property `s` to edges of the graph corresponding to the statistical significance (`s = log10(p_0/p)`)
of the co-occurrence computed from a null model[1].
The computation is done using `p0=1e-6` and `p0` is saved as a graph property.
Different values of `p0` can later be tested by shifting `s`.
The resulting graph is saved to `graph_file`.

*Optional parameters that can be added to `job`:*
- `ncpu` : number of processors to be used. (Default is 6).


[1] Martinez-Romo, J. et al. Disentangling categorical relationships through a graph of co-occurrences. Phys. Rev. E 84, 1–8 (2011).


In [6]:
# Add the statistical significance property `s` to the edges of the graph and
# save the resulting graph to job['graph_file'].
# The optional job['ncpu'] sets the number of processors used.
addStatSigniHT(job).run()

computing significance of links


[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:   13.5s


finished
*** took 37.58s


[Parallel(n_jobs=6)]: Done 351 out of 351 | elapsed:   37.4s finished


### 4. Select the initial hashtags to start the propagation
This will display the top-occurring hashtags.

*Optional parameters that can be added to `job`:*
- `num_top_htgs` : number of top hashtags to be displayed. (Default is 100).

In [7]:
# Display the most frequent hashtags so that seed hashtags can be chosen
# from the list. The optional job['num_top_htgs'] sets how many are shown.
selectInitialHashtags(job).run()

 Top 100 occuring hashtags:
* rank: (name: frequency)
0 :('finance', 8162)
1 :('etrade', 8028)
2 :('tradeking', 7663)
3 :('money', 7436)
4 :('stock', 4169)
5 :('401k', 4141)
6 :('amtd', 4141)
7 :('alerts', 4141)
8 :('stocks', 3937)
9 :('stockmarket', 3915)
10 :('cash', 3911)
11 :('market', 3523)
12 :('ameritrade', 3522)
13 :('scottrade', 3522)
14 :('mortgage', 714)
15 :('rates', 352)
16 :('didyouknow', 210)
17 :('loan', 94)
18 :('loans', 76)
19 :('seattle', 70)
20 :('history', 70)
21 :('interest', 70)
22 :('canadian', 68)
23 :('motors', 58)
24 :('kia', 58)
25 :('hiring', 49)
26 :('year', 47)
27 :('house', 45)
28 :('news', 45)
29 :('a', 44)
30 :('get', 44)
31 :('major', 40)
32 :('rate', 39)
33 :('refinancing', 38)
34 :('boycottcnn', 36)
35 :('calculator', 35)
36 :('lenders', 35)
37 :('daytrading', 32)
38 :('stocktrader', 31)
39 :('trump', 30)
40 :('vermont', 30)
41 :('2nd', 29)
42 :('mortgages', 29)
43 :('estimate', 29)
44 :('calcu', 27)
45 :('ecommerce', 26)
46 :('jobsearch', 26)
47 :(

Select the seed hashtags you want to use from the list (minimum two) 
and add them to the `job` dictionary with the key `initial_htgs_lists`:

In [8]:
# initial_htgs_lists is a list of lists: one inner list of seed hashtags per camp.
job['initial_htgs_lists'] = [
    ['money'],  # seeds of the first camp
    ['401k'],   # seeds of the second camp
]

### 5. Propagate labels to neighboring hashtags
This part can be looped by updating the `htgs_lists` in `job` with the result of the label propagation to reach a larger number of hashtags.

In [9]:
# Start with the hashtag seeds selected above.
# Each camp's list is copied rather than aliased, so that later in-place
# changes to job['htgs_lists'] cannot silently alter job['initial_htgs_lists'].
job['htgs_lists'] = [list(camp) for camp in job['initial_htgs_lists']]

The loop has two steps:
1. `propagateLabels` uses the graph from `graph_file` and the current hashtags from `htgs_lists` (initially the seeds in `initial_htgs_lists`) to propagate their labels to their neighbors, taking into account the statistical significance of edges. The results are saved in a pandas DataFrame in `propag_results_filename`.
    - *Optional parameters that can be added to `job`:*
        - `count_ratio` : threshold, $r$, for removing hashtags with a number of single occurrences smaller than $r \max\limits_{v_j\in C_k} c_j$ where $c_i$ is the number of occurrences of the hashtag associated with vertex $v_i$, and $C_k$ is the class to which $v_i$ belongs. (Default = 0.001).
        - `p0` : significance threshold; only edges with p_val <= p0 are kept. (Default = 1e-5).

2. Visualisation of the results using `selectHashtags`, and updating the `htgs_lists` list. This will print a list of hashtags, $i$, for each camp $C_k$ satisfying: $\sum_{j \in C_k} s_{ij} > \sum_{j \in C_l} s_{ij}$, where $C_l$ represents all the camps other than $C_k$.
    - *Optional parameters that can be added to `job`:*
        - `num_top_htgs` : number of top hashtags to be displayed in each camp. (Default is 100).

In [13]:
# 1st step of the loop: propagate the camp labels over the hashtag graph.
# The results are saved to job['propag_results_filename'].
propagateLabels(job).run()

Propagating labels
saving results


In [14]:
# 2nd step of the loop: display, for each camp, the hashtags assigned to it.
selectHashtags(job).run()
# Meaning of the displayed columns:
# count      (= total number of occurrences)
# label_init (= initial label before propagation; -1 means no initial label)
# vertex_id  (= ID of the vertex in the hashtag graph)
# label_sum1 (= number of neighbors with label 1)
# signi_sum1 (= sum of the significance of the edges to neighbors with label 1)
# label_sum2 (= number of neighbors with label 2)
# signi_sum2 (= sum of the significance of the edges to neighbors with label 2)


 +++ hashtags in camp 2
           name  count  label_init  vertex_id  label_sum2    signi_sum2  \
5       finance   8162           2          5         4.0   4527.106262   
0        etrade   8028           2          0         5.0   8079.589318   
64    tradeking   7663           2         64         5.0   8517.992058   
52        stock   4169           2         52         6.0  14079.147091   
60         401k   4141           2         60         6.0  14382.367984   
66         amtd   4141           2         66         6.0  14382.367984   
78       alerts   4141           2         78         6.0  14382.367984   
44       motors     58          -1         44         1.0      7.655783   
54          kia     58          -1         54         1.0      7.655783   
98        major     40          -1         98         1.0      3.720529   
71   daytrading     32          -1         71         1.0      0.846667   
72  stocktrader     31          -1         72         1.0      1.978751   


In [12]:
# You can now update the hashtag lists (one inner list per camp)
# and return to the 1st step.
job['htgs_lists'] = [
    ['money', 'stocks', 'stockmarket', 'cash', 'market', 'ameritrade', 'scottrade'],
    ['401k', 'finance', 'etrade', 'tradeking', 'stock', 'amtd', 'alerts'],
]

### 6. Update the hashtag groups in the database



In [None]:
# Update the hashtag groups stored in the database.
# NOTE(review): presumably uses the camps in job['htgs_lists'] — confirm against updateHTGroups.
updateHTGroups(job).run()