## TwitterProcessor

### Set environment variables

To load the project modules when this notebook is run from outside the repository folder, set the script **PATH** below, e.g. `C:/TwitterProcessor`:

In [None]:
# Folder containing the TwitterProcessor scripts. The next cell falls back to the
# current working directory when this value is not an existing directory.
PATH = "/media/data/scripts/chn@git/chn-tools/tools/TwitterProcessor" # <-- optional if running from native path

In [None]:
import importlib.util, os

if not os.path.isdir(PATH):
    PATH = os.getcwd()
PATH = os.path.realpath(PATH)

spec = importlib.util.spec_from_file_location("__init__", PATH+'/__init__.py')
init = importlib.util.module_from_spec(spec)
spec.loader.exec_module(init)

%matplotlib inline
%load_ext autoreload
%autoreload 2

### Import functions

In [None]:
# plotly offline mode; initialised for notebook rendering in the next cell
import plotly.offline as py

from tools import TwitterProcessor

# NOTE(review): these wildcard imports hide which module provides the helpers
# used later in the notebook (e.g. df_filter_timestamp, df_filter_text,
# tweets_parse, df_worldmap, plot_worldmap). Consider explicit imports once the
# defining modules are confirmed.
from tools.DataFrames.dflib import *
from tools.DataFrames.filter import *
from tools.TwitterProcessor.TweetParser import *
from tools.TwitterProcessor.worldmap import *

In [None]:
# Initialise plotly for inline notebook output. With connected=True plotly.js is
# loaded from an online CDN, so rendering needs internet access.
py.init_notebook_mode(connected=True)

### Load tweets data frame

In [None]:
# Placeholder: replace "" with the tweets data frame (or load it in the next cell)
# before running the cells below, which pass `tweets` to the df_filter_* helpers.
tweets = ""

In [None]:
# Optional: uncomment to load tweets from a file. `file_name` is not defined in
# the cells above and must be set first.
#tweets = tweets_load(file_name) # <-- ensure older format compatibility

#### Select specific interval to filter data frame (optional)

In [None]:
# Interval bounds passed to df_filter_timestamp in the next cell.
# The defaults run from 1970 to early 2038; narrow them to restrict the interval.
min_date = "1970-01-01"
max_date = "2038-01-18 03:14:07"

In [None]:
# Filter on the "timestamp" column using the bounds above (presumably keeps rows
# between min_date and max_date - confirm inclusivity in df_filter_timestamp).
# Note: this rebinds `tweets` to the filtered result.
tweets = df_filter_timestamp(tweets, min_date, max_date, column="timestamp")

#### Select only tweets that match a text filter rule (optional)

In [None]:
# Filter rule passed to df_filter_text in the next cell.
# NOTE(review): `|` presumably separates alternatives (regex-style) - confirm.
text = "trump|Trump"

In [None]:
# Apply the text rule to the "tweet_text" column.
# Note: this rebinds `tweets` to the filtered result.
tweets = df_filter_text(tweets, text, column="tweet_text")

### Parse tweets and generate output data

In [None]:
# Parse the (filtered) tweets. Results are presumably written under 'RESULTS';
# the world-map cell below reads RESULTS/locations.csv.
tweets_parse(tweets, stop_words='english', output='RESULTS')

#### Choropleth world map

Accepted format for `country_code` is three letters long by default. **Tip:** uncomment `line 7` below to enable writing to `worldmap.html`.

In [None]:
locations = 'RESULTS/locations.csv' # locations file; the parse step above was given output='RESULTS'

df = df_worldmap(locations) # leave blank to check

plot_worldmap(df,
              name='Worldmap',
              #output='worldmap.html', # uncomment to also write the map to worldmap.html
              inline=True,
              auto_open=False)

#### Compress output →  `output.zip`

In [None]:
!zip output.zip/*{csv,xls,xlsx,png,html}

### [Download output files](output.zip)

___

#### Calculate metrics indices `JS` `LEGACY`

In [None]:
# Resolve the TwitterClusterJs folder via os.path instead of relying on a bare
# `abspath` being leaked into the namespace by the wildcard imports.
sqlnode = os.path.abspath(os.path.join(PATH, 'tools', 'TwitterClusterJs'))
# Note: this rebinds `tweets` to the value returned by sqlnode_tweets.
tweets  = sqlnode_tweets(tweets, sqlnode=sqlnode)

#### Start TwitterProcessor `LEGACY`

In [None]:
# Unpack the four helper objects returned by the package-level __init__() call.
# The commented lines below presumably show the class behind each name - confirm
# against the TwitterProcessor package.
ta, tg, ml, tp = TwitterProcessor.__init__()
# ta = TweetAnalytics()
# tg = TweetGraph()
# ml = TweetML()
# tp = TweetProcessor()

#### Generate preformatted analytics report `LEGACY`

In [None]:
# Pretrained artefacts are looked up under PATH. Paths are built with os.path
# instead of relying on a bare `abspath` leaked in by the wildcard imports.
model_dir = os.path.join(PATH, 'tools', 'TwitterProcessor')
sent_model = os.path.abspath(os.path.join(model_dir, 'twitter_sentiment_model.h5'))
word2vec = os.path.abspath(os.path.join(model_dir, 'word2vec_twitter_model.bin'))

if os.path.isfile(sent_model):
    # load pretrained sentiment model
    ml.load_sentiment_model(sent_model)
else:
    # make the skipped load visible instead of silently continuing
    print("Sentiment model not found, skipping: " + sent_model)

if os.path.isfile(word2vec):
    # load pretrained word embedding
    tp.load_word_embedding_from_file(word2vec)
else: # train new word embedding
    tp.trainWordEmbedding(tweets)

In [None]:
# create new topic model
ml.create_topic_model_LDA(tweets,
                          tp, # TweetProcessor
                          num_topics=5,
                          extra_stopwords=[])

# append topic to tweet time slice
tp.add_topic(tweets, ml)

# append sentiment to tweet time slice
tp.add_sentiment(tweets, ml)

In [None]:
# write report to file
ta.write_report(tweets,
                ml, # TweetML
                'report.xls',
                num_lines_per_topic=100)

#### Tweet topic modeling `LEGACY`

In [None]:
# Placeholder text passed to ml.predict_LDA_topic in the next cell.
# Note: this reuses the name `text` from the text-filter cell earlier on.
text = 'insert_text_here'

In [None]:
# get topics
ml.get_LDA_topics()

# predict topics
ml.predict_LDA_topic(text, tp)

# find clusters
topic_model = ml.findTopicClusters(tp.word_embedding_model, n_clusters=5)

# sort topic model
sorted_topic_model = ml.sortTopicModel(topic_model)

# print to examine
ml.printTopicModel(sorted_topic_model)

#### Write to output `LEGACY`

In [None]:
# define topic cluster
topic_number = 0

# find tweets from a specific cluster
topic_specific_tweet_list = ml.findTweetsTopicSpecific(topic_model[0][topic_number], tweet_data)

# write tweets of a specific topic to CSV
ml.printTweetsTopicSpecificToCSV(topic_specific_tweet_list, file_path='tweet_specific_topics.csv')

___________________________________

In [None]:
# ta = TweetAnalytics()
# ta.all_tweet_text(tweet_data, topic_number=0)
# ta.sentiment_per_topic(tweet_data)
# ta.top_by_group_by_topic(tweet_data, by='retweets_plus_favorites', topic_number=0, num_influencers=5)
# ta.top_media_by_topic(tweet_data, topic_number=0)
# ta.top_topics_by_count(tweet_data)
# ta.unique_tweet_text(tweet_data, topic_number=0)
# ta.write_report(tweet_data, ml, filename='./ta_report.xls', num_lines_per_topic=5)

In [None]:
# ml = TweetML()
# ml.create_topic_model_LDA(tweet_data, tweet_preprocessor, num_topics=5, extra_stopwords=[])
# ml.findOptimalNumberClusters(model, num_clusters_to_try)
# ml.findTopicClusters(model, n_clusters)
# ml.findTweetsTopicSpecific(topic_list, tweet_data, sort=True)
# ml.flatten_2D_trainingset(X)
# ml.get_LDA_topics()
# ml.groupSentenceIntoTopicClusters(sentence, word_clusters_dict, tp)
# ml.load_sentiment_model(filepath='twitter_sentiment_model.h5')
# ml.neuralNetModel_Conv_Flattened()
# ml.neuralNetModel_Conv_Sequential_1D()
# ml.plotTopicClusters(word2vec_model)
# ml.predict_LDA_topic(tweet_text, tweet_preprocessor)
# ml.predict_sentiment(tweet_text, tweet_preprocessor)
# ml.printTopicClusterSize(word_clusters_list)
# ml.printTopicModel(topic_model)
# ml.printTweetsTopicSpecificToCSV(tweet_list_topic_specific, file_path='topic_specific_tweet_list.csv')
# ml.sentiment_model_nn_conv2d_seq(X_train, Y_train, batch_size=1000, epochs=10, )
# ml.sortTopicModel(topic_model)
# ml.tweetsToCSV(tweet_list_topic_specific)
# ml.create_test_set(test_tweets)

In [None]:
# tg = TweetGraph()
# tg.main()
# tg.prepare_graph_with_attributes(tweet_data, add_sentiment=True, add_topic=True)
# tg.prepare_graph_without_attributes(tweet_data)
# tg.build_at_graph()
# tg.build_complete_graph()
# tg.build_rt_graph()
# tg.clear_all()
# tg.export_graph_gsv(gexf_file='postgrowth_atgraph.gexf')
# tg.load_data_for_graph(search_string='#blackpanther', start_date=None, end_date=None)
# tg.show_graph()

In [None]:
# tp = TweetProcessor()
# tp.add_sentiment(tweet_data, ml_sentiment)
# tp.add_topic(tweet_data, ml_topic_model)
# tp.addViralityIndex(tweet_data)
# tp.avg_tweet_length(tweet_data)
# tp.connect_DB()
# tp.create_prediction_set_sentiment(tweet_data)
# tp.create_prediction_set_sentiment_from_list(X_input=[])
# tp.create_sentence_embedding_from_list(sentence_list)
# tp.create_training_set_sentiment(tweet_data)
# tp.createTrainingSetLDATopicModel(tweet_data, extra_stopwords=[])
# tp.createTrainingSetTweetVirality(tweet_data, X, Y)
# tp.createTrainingSetWordEmbedding(tweet_data, X)
# tp.drop_duplicate_tweets(tweet_data)
# tp.filter_tweets_by_date(tweet_data)
# tp.find_duplicate_tweets(tweet_data)
# tp.getWordListAndVectors(word2vec_model)
# tp.is_number(s)
# tp.load_all_graph_tweet_data_by_search_mysql(search_string, start_date=None, end_date=None)
# tp.load_all_graph_tweet_data_by_search_mysql_eliminate_duplicate_retweets(search_string, start_date=None, end_date=None)
# tp.load_all_report_tweet_data_by_search_mysql(search_string, start_date=None, end_date=None)
# tp.load_all_retweet_data_mysql()
# tp.load_all_tweet_data_by_search_mysql(search_string, start_date=None, end_date=None)
# tp.load_all_tweet_data_mysql(search_string, start_date=None, end_date=None)
# tp.load_random_tweet_data()
# tp.load_tweet_data_by_topic_and_id(search_string, start_id=None, end_id=None, drop_duplicates=True)
# tp.load_tweets(table_name=None, org_id=0, narr_id=1, scrape_id=None)
# tp.load_tweets_csv(filepath='Sentiment Analysis Dataset.csv', num_rows=MAX_TWEETS_TO_LOAD)
# tp.load_tweets_mysql(search_string)
# tp.load_word_embedding_from_file(filepath='word2vec_twitter_model.bin')
# tp.load_wordembedding_twitter_search()
# tp.loadStopwords(stopwords_list=[])
# tp.sentence2embedding(sentence, extra_stopwords=[], dimensions=EMBEDDING_DIMENSIONS)
# tp.sentence2tokens(sentence)
# tp.trainWeightedWordEmbedding(tweet_data, max_word_dimensions=100, max_vocab_size=None, window=5, extra_stopwords=[], min_count=5, epochs=10)
# tp.trainWordEmbedding(tweet_data, max_word_dimensions=EMBEDDING_DIMENSIONS, max_vocab_size=None, window=5, extra_stopwords=[], min_count=5, epochs=10)
# tp.trainWordEmbedding_iloc(tweet_data, max_word_dimensions=EMBEDDING_DIMENSIONS, max_vocab_size=None, window=5, extra_stopwords=[], min_count=5, epochs=10)
# tp.word2vec(word)
# tp.write_to_parsing_script_report(tweet_data, filepath='parsing_script_report.csv')

___

### References

* Choropleth Maps @ plot.ly: https://plot.ly/python/choropleth-maps/