In [47]:
from elasticsearch import Elasticsearch
import tweepy
import os
import json
import pickle
from elasticsearch import helpers

In [48]:
#connecting to the server
es = Elasticsearch(HOST = "localhost", PORT = 9200)

In [49]:
#check connection
if not es.ping():
    raise ValueError("elasticsearch connection failed")
else:
    print("elasticsearch connection successful")

elasticsearch connection successful


In [50]:
mapping = {
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 0,
    "index.mapping.total_fields.limit": 5000,
    "analysis": {
      "analyzer": {
        "nlp_analyzer": {
          "type": "custom",
          "tokenizer": "tweeter_tokenizer",
          "filter": ["lowercase"]
        }
      },
      "tokenizer": {
        "tweeter_tokenizer": {
          "type": "pattern",
          "pattern": "(\\w+|\\S*[\\S*])",
          "group": 1
        }
      }
    }
  },  
    "mappings": {
      "properties": {
        "sentiment" : {
          "type": "keyword"
        },
        "created_at": {
          "type":   "date",
          "format": "EEE MMM dd HH:mm:ss Z yyyy"
        },
        "id_str":{
            "type" : "keyword"
        },
        "text":{
            "type" : "text"
        },
        "user_account_created_at": {
            "type":   "date",
            "format": "EEE MMM dd HH:mm:ss Z yyyy"
        },
        "user_id":{
            "type": "keyword"
        },
        "screen_name":{
            "type": "keyword"
        },
        "user_followers_count":{
            "type": "integer"
        },
        "user_friends_count":{
            "type": "integer"
        },
        "coordinates.coordinates": {
            "type": "geo_point"
        },
        "place.bounding_box": {
          "type": "geo_shape",
          "coerce": True,
          "ignore_malformed": True
        },
        "user_coordinates":{
          "type": "geo_point"
        },
        "user_statuses_count":{
            "type": "integer"
        },
        "tweet_length":{
            "type": "integer"
        },
        "favorite_count":{
            "type": "integer"
        },
        "retweet_count":{
            "type": "integer"
        },
        "hashtags": {
          "type": "keyword"
        },
        "author_description":{
            "type": "text"
        },
        "is_author_verified": {
              "type": "boolean"
        }
      }
  }
}

In [51]:
def createIndex(name,mappings):
    if not es.indices.exists(index = name):
        res = es.indices.create(index = name ,body = mappings ,ignore = 400)
        print(res)
        if 'acknowledged' not in res.keys() or res['acknowledged'] != True or res['index'] != name: 
            raise ValueError("index creating failed")
        else:
            print(name,":index created successfully")
    else:
        print("index exists")

In [52]:
#create the index (database name)
#EDIT NAME HERE
index_name = "twitter-depression-analysis-group3"
createIndex(index_name,mapping)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'twitter-depression-analysis-group3'}
twitter-depression-analysis-group3 :index created successfully


In [53]:
#tweepy stuff
# CONSUMER_KEY = ""
# CONSUMER_SECRET = ""
# OAUTH_TOKEN = ""
# OAUTH_TOKEN_SECRET =  ""

In [54]:
# auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
# auth.set_access_token(OAUTH_TOKEN, OAUTH_TOKEN_SECRET)
# api = tweepy.API(auth)

In [55]:
#getting the data
# file = '../data/anchor_tweets.json'
# print(file)
# ids_list = []
# f = open(file,'r')
# for tweet in f:
#         tweet = tweet.strip("\r\n")
#         tweet = json.loads(tweet)
#         ids_list.append(tweet)
# f.close()

file = "../data/new_list.pickle"
f = open(file,"rb")
cnt, ids_list = pickle.load(f)
f.close()

In [56]:
print(len(ids_list))

9961


In [57]:
def trimTweets(tweets):
    #database is only accepting 1000 fields, only take the data you need
    newTweets = []
    for t in tweets:
        data = {}
        data["_index"] = t["_index"]
        data["created_at"] = t["created_at"]
        data["id_str"] = t["id_str"]
        data["text"] = t["full_text"]
        data["user_account_created_at"] = t["user"]["created_at"]
        data["user_id"] = t["user"]["id"]
        data["screen_name"] = t["user"]["screen_name"]
        data["user_followers_count"] = t["user"]["followers_count"]
        data["user_friends_count"] = t["user"]["friends_count"]
        data["coordinates"] = t["coordinates"]
        data["place"] = t["place"]
        data["user_coordinates"] = t["user_coordinates"]
        data["statuses_count"] = t["user"]["statuses_count"]
        data["tweet_length"] = len(t["full_text"])
        data["favorite_count"] = t["favorite_count"]
        data["retweet_count"] = t["retweet_count"]
        data["hashtags"] = []
        for i in t["entities"]["hashtags"]:
            data["hashtags"].append(i["text"].lower())
        data["author_description"] = t["user"]["description"]
        data["is_author_verified"] = t["user"]["verified"]
        newTweets.append(data)
    return newTweets

In [58]:
##MAKE CHANGES HERE - GIVE HELPER.BULK A LIST OF TWEET JSONS AT A TIME - 
max_count = 1000
total_count = 0
count = 0
batch_tweets = []
for tweet in ids_list:
    count = count + 1
    total_count = total_count + 1
    batch_tweets.append(tweet)
    if count == max_count:
        for i in range(len(batch_tweets)):
#             tweets[i]["sentiment_score"] = id_list[i]
            batch_tweets[i]["_index"] = index_name
        batch_tweets = trimTweets(batch_tweets)
        res = helpers.bulk(es, batch_tweets)
        print(res)
        print(total_count,"done")
        count = 0
        batch_tweets = []
for i in range(len(batch_tweets)):
#     tweets[i]["sentiment_score"] = id_list[i]
    batch_tweets[i]["_index"] = index_name
batch_tweets = trimTweets(batch_tweets)
helpers.bulk(es, batch_tweets)
print("done")

(1000, [])
1000 done
(1000, [])
2000 done
(1000, [])
3000 done
(1000, [])
4000 done
(1000, [])
5000 done
(1000, [])
6000 done
(1000, [])
7000 done
(1000, [])
8000 done
(1000, [])
9000 done
done


In [46]:
#please dont use this recklessly
es.indices.delete(index='twitter-depression-analysis-group3', ignore=[400, 404])

{'acknowledged': True}