# Update Database Structure

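Changes to the stored data and its format affect how the information is processed and saved. The classifier therefore provides update methods that migrate an existing database to the current storage structure; this notebook runs them and then checks the result.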



In [1]:
import ipywidgets as widgets
from IPython.core.display import display, HTML, update_display
import json, os, pickle
from random import seed, randint
from tweet_requester.analysis import TweetAnalyzer
from tweet_requester.display import TweetInteractiveClassifier, \
JsonLInteractiveClassifier, TSess, prepare_google_credentials, PROCESSING_STAGES, logging
from twitter_secrets import C_BEARER_TOKEN 
# Data file and working-directory paths.
JL_DATA = "./tweetsRickyRenuncia-final.jsonl"
BASE_DIR = "./Evaluating Content"

# --- Database update parameters ---
# On April 30, 2021 the RR team rehydrated their data with twarc.
# 1619755200 is 2021-04-30 04:00:00 UTC, i.e. midnight at UTC-4.
april302021 = 1619755200.0
# Commit hash handed to the update_database_* calls.
# Previously used: "9219b7a01ce28f5bc0d61c913b3f914f967614fd"
git_commit = "2ac78595cceef98a56c518c24f2187360e1527e3"

# Session used to request tweets; responses are cached under ./tweet_cache/.
tweet_session = TSess(
    C_BEARER_TOKEN,
    cache_dir="./tweet_cache/",
    hash_split=True,
    compression_level=5,
    sleep_time=3,
)

# Credentials loaded from the local Google Translate key file.
google_credentials = prepare_google_credentials(
    credentials_file="./google_translate_keys.json",
)

In [2]:
# Build the interactive classifier from the tweet-id list, the shared
# request session, the translation credentials and the local SQLite file.
classifier = JsonLInteractiveClassifier(
    tweet_ids_file="tweetsRickyRenuncia-final.txt",
    sqlite_db="tweets.db",
    session=tweet_session,
    google_credentials=google_credentials,
    pre_initialized=True,
    mute=True,
)

In [3]:
# Close the classifier before the database update calls are run.
# NOTE(review): presumably this releases the SQLite connection - confirm.
classifier.close()

In [4]:
import logging
# Set the logging threshold to WARNING for the update run.
logging.basicConfig(level=logging.WARNING)
# Apply the database updates step by step: v01 -> v02 -> v03 -> v04.
# The first step receives the rehydration timestamp (april302021) as
# dateCreated; every step receives the commit hash in git_commit.
classifier.update_database_v01_v02(dateCreated=april302021, git_commit=git_commit)
classifier.update_database_v02_v03(git_commit=git_commit)
classifier.update_database_v03_v04(git_commit=git_commit)



In [4]:
# Reconnect and report how many tweets sit in each processing stage,
# followed by a short sample of the PREPROCESSED rows.
classifier.connect()
cur = classifier.cursor()

# One (state, count) row per stage, ordered by the numeric state value.
cur.execute("""
SELECT state, count(*) from tweet
GROUP BY state ORDER BY state;""")
rows = cur.fetchall()

# Right-aligned stage name, left-aligned count.
row_fmt = "{:>25} | {:<8}"
print(row_fmt.format("PROCESSING_STAGE", "COUNT"))
print(row_fmt.format("-" * 25, "-" * 8))
for state, count in rows:
    # Translate the stored integer back into its enum member name.
    print(row_fmt.format(PROCESSING_STAGES(state).name, count))

# All rows whose state is PREPROCESSED.
cur.execute("""
SELECT * from tweet
WHERE tweet_id in (
SELECT tweet_id FROM tweet
WHERE state in (?));""",
(PROCESSING_STAGES.PREPROCESSED.value,))
rows_sample = cur.fetchall()
cur.close()

print("\n\nSample: ")
# Show at most the first five rows.
for row in rows_sample[:5]:
    print("\t", row)

         PROCESSING_STAGE | COUNT   
------------------------- | --------
              UNPROCESSED | 493031  
                REVIEWING | 373     
                FINALIZED | 68      
     UNAVAILABLE_EMBEDING | 1328    
                  RETWEET | 2814    
             PREPROCESSED | 2714    


Sample: 
	 ('1002186716046864386', 6)
	 ('1102716035176775681', 6)
	 ('1138785914757533696', 6)
	 ('1148321742697504769', 6)
	 ('1149490876592218113', 6)


In [5]:
# Display page 3 of the accepted-tweets listing, three entries per page.
# NOTE(review): what counts as "accepted" is defined in tweet_requester - confirm.
classifier.display_accepted(page=3, per_page=3)

In [5]:
# Start the classifier's evaluation workflow.
# NOTE(review): presumably renders the interactive widget UI - confirm.
classifier.StartEvaluations()

In [6]:
# Put up to ten tweets that are stuck in REVIEWING back into UNPROCESSED.
classifier.connect()
cur = classifier.cursor()
cur.execute("""
SELECT * from tweet
WHERE tweet_id in (
SELECT tweet_id FROM tweet
WHERE state in (?));""",
(PROCESSING_STAGES.REVIEWING.value,))
rows = cur.fetchall()
# The result set is fully loaded, so the cursor can be released
# before the per-tweet state changes.
cur.close()

# Only the first ten rows are touched per run.
for row in rows[:10]:
    print(row)
    # The first column of each row holds the tweet id.
    classifier.tweet_set_state(
        tweet_id=row[0],
        state=PROCESSING_STAGES.UNPROCESSED,
    )


('1150839690184069127', 1)
('1150842425390317573', 1)
('1150848287299244036', 1)
('1150849765376241664', 1)
('1150850130704248833', 1)
('1150857492173398018', 1)
('1150859636851040256', 1)
('1150862160760909824', 1)
('1150862260237164545', 1)
('1150863149429592064', 1)


In [7]:
# Page number and page size for the accepted-tweets listing;
# edit these two values to browse other pages.
page=5
per_page=5
classifier.display_accepted(page=page, per_page=per_page)

In [None]:
from datetime import datetime
from time import sleep
import logging

# Run classifier.preprocess_batch(n=150) repeatedly, leaving at least
# PULL_INTERVAL_SECONDS between batches, until a batch raises.
PULL_INTERVAL_SECONDS = 900  # 15 minutes

# Pretend the previous pull happened one full interval ago so the first
# batch starts immediately.
last_pull = datetime.now().timestamp() - PULL_INTERVAL_SECONDS
while True:
    # Re-read the clock on every pass so the decision never uses a stale value.
    current_time = datetime.now().timestamp()
    remaining = PULL_INTERVAL_SECONDS - (current_time - last_pull)
    if remaining > 0:
        # Too early for the next batch: sleep for the time left in the interval.
        # Sleeping only when the value is positive matters because
        # time.sleep() raises ValueError for negative arguments, which
        # would happen after a batch lasting more than twice the interval.
        sleep(remaining)
        continue

    start_pull = datetime.now().timestamp()
    try:
        classifier.preprocess_batch(n=150)
    except Exception as err:
        # Any failure stops the loop; the error is logged, not re-raised.
        logging.error(err)
        break
    # Average the download time to the middle of the transaction.
    last_pull = (start_pull + datetime.now().timestamp()) / 2.0

In [8]:
# Run a single preprocessing batch with n=250.
# NOTE(review): n is presumably the number of tweets per batch - confirm.
classifier.preprocess_batch(n=250)

In [10]:
# Install a pip package in the current Jupyter kernel.
# Invoking pip through sys.executable runs it with the same interpreter
# this kernel is using.
# NOTE(review): no version is pinned. The recorded run downloaded
# tweet_requester 0.0.1 and uninstalled ipython 7.20.0 and
# google-cloud-translate 3.2.1 in favour of newer releases.
import sys
!{sys.executable} -m pip install tweet-requester

Collecting tweet-requester
  Downloading tweet_requester-0.0.1-py3-none-any.whl (21 kB)
Collecting google-cloud-translate>=3.3.1
  Downloading google_cloud_translate-3.3.2-py2.py3-none-any.whl (104 kB)
[K     |████████████████████████████████| 104 kB 367 kB/s eta 0:00:01
[?25hCollecting ipython>=7.25.0
  Downloading ipython-7.26.0-py3-none-any.whl (786 kB)
[K     |████████████████████████████████| 786 kB 443 kB/s eta 0:00:01
Collecting matplotlib-inline
  Using cached matplotlib_inline-0.1.2-py3-none-any.whl (8.2 kB)
Installing collected packages: matplotlib-inline, ipython, google-cloud-translate, tweet-requester
  Attempting uninstall: ipython
    Found existing installation: ipython 7.20.0
    Uninstalling ipython-7.20.0:
      Successfully uninstalled ipython-7.20.0
  Attempting uninstall: google-cloud-translate
    Found existing installation: google-cloud-translate 3.2.1
    Uninstalling google-cloud-translate-3.2.1:
      Successfully uninstalled google-cloud-translate-3.2.1
