In [1]:
from elasticsearch import Elasticsearch

In [2]:
es = Elasticsearch('http://localhost:9200')

In [3]:
mapping = {
    "mappings": {
        "properties": {
            "date_posted": {"type": "date"},
            "funny": {"type": "integer"},
            "helpful": {"type": "integer"},
            "hour_played": {"type": "integer"},
            "is_early_access_review": {"type": "boolean"},
            "recommendation": {"type": "keyword"},
            "review": {"type": "text"},
            "title": {"type": "text"},
            "sentiment": {"type": "keyword"},
            "Dominant_Topic": {"type": "integer"},
            "Perc_Contribution": {"type": "float"},
            "Topic_Keywords": {"type": "text"}
        }
    }
}

# Create the index
es.indices.create(index="reviews", body=mapping)

BadRequestError: BadRequestError(400, 'resource_already_exists_exception', 'index [reviews/LAIkQU0eRfCUebwc3g4FtA] already exists')

In [6]:
import pandas as pd

In [7]:
df = pd.read_csv('topic.csv')
df.head(3)

Unnamed: 0,date_posted,funny,helpful,hour_played,is_early_access_review,recommendation,review,title,sentiment,Dominant_Topic,Perc_Contribution,Topic_Keywords
0,2019-02-10,2,4,578,False,Recommended,&gt Played as German Reich&gt Declare war on B...,Expansion - Hearts of Iron IV: Man the Guns,Negative,1,0.9439,"get, server, like, best, play, people, time, e..."
1,2019-02-10,0,0,184,False,Recommended,yes.,Expansion - Hearts of Iron IV: Man the Guns,Neutral,0,0.6532,"like, time, get, fun, play, still, even, playe..."
2,2019-02-07,0,0,892,False,Recommended,Very good game although a bit overpriced in my...,Expansion - Hearts of Iron IV: Man the Guns,Positive,0,0.5229,"like, time, get, fun, play, still, even, playe..."


In [15]:
df.groupby('title').size()

title
ACE COMBAT™ 7: SKIES UNKNOWN                         10
ARK: Survival Evolved                               195
ASTRONEER                                          2661
Battlefleet Gothic: Armada 2                         10
Beat Saber                                           10
Cold Waters                                          10
Dead by Daylight                                  22221
Divinity: Original Sin 2 - Definitive Edition       190
Don't Starve Together                               165
Euro Truck Simulator 2                              501
Expansion - Hearts of Iron IV: Man the Guns           3
Factorio                                            170
Farming Simulator 19                                 10
Football Manager 2019                                10
Foundation                                           10
GOD EATER 3                                          10
Garry's Mod                                         202
Grand Theft Auto V                        

In [11]:
df.fillna(value={"review": "Unknown", "sentiment": "Unknown"}, inplace=True)
df.isna().sum()

date_posted               0
funny                     0
helpful                   0
hour_played               0
is_early_access_review    0
recommendation            0
review                    0
title                     0
sentiment                 0
Dominant_Topic            0
Perc_Contribution         0
Topic_Keywords            0
dtype: int64

In [12]:
df.describe()

Unnamed: 0,funny,helpful,hour_played,Dominant_Topic,Perc_Contribution
count,434891.0,434891.0,434891.0,434891.0,434891.0
mean,533302.4,1.004114,364.130773,1.125022,0.725398
std,47856400.0,59.462935,545.961198,0.821322,0.161826
min,0.0,0.0,0.0,0.0,0.3333
25%,0.0,0.0,62.0,0.0,0.6239
50%,0.0,0.0,190.0,1.0,0.7394
75%,0.0,0.0,450.0,2.0,0.8597
max,4294967000.0,28171.0,31962.0,2.0,0.9998


In [None]:
from scipy import stats
df['funny_z_score'] = stats.zscore(df['funny'])
df_filtered = df[df['funny_z_score'].abs() <= 3]  # threshold is typically 2 or 3


In [None]:
# Assuming 'df' is your pandas DataFrame
for index, row in df_filtered.iterrows():
    # Convert row to JSON (dict)
    document = row.to_dict()
    if row['funny'] > 2147483647:
        row['funny'] = 2147483647
    # Index the document
    es.index(index="reviews", id=index, document=document)


In [None]:
query = {
    "query": {
        "match_all": {}  # This is a simple query that matches all documents
    }
}

# Execute the search query against a specific index
response = es.search(index="reviews", body=query)

# Print the response
print("Got %d Hits:" % response['hits']['total']['value'])
for hit in response['hits']['hits']:
    print(hit["_source"])

Got 10000 Hits:
{'date_posted': '2018-09-29', 'funny': 0, 'helpful': 0, 'hour_played': 1458, 'is_early_access_review': False, 'recommendation': 'Recommended', 'review': "the best! I've been playing since it came out", 'title': 'Dead by Daylight', 'sentiment': 'Positive', 'Dominant_Topic': 1, 'Perc_Contribution': 0.7249, 'Topic_Keywords': 'get, server, like, best, play, people, time, ever, one, rust', 'funny_z_score': -0.011143816181542173}
{'date_posted': '2018-11-01', 'funny': 1, 'helpful': 1, 'hour_played': 29, 'is_early_access_review': False, 'recommendation': 'Recommended', 'review': "If you're against Generator abuse and want to support generators I suggest you buy this game and help fixing them!", 'title': 'Dead by Daylight', 'sentiment': 'Negative', 'Dominant_Topic': 0, 'Perc_Contribution': 0.5488, 'Topic_Keywords': 'like, time, get, fun, play, still, even, player, one, really', 'funny_z_score': -0.011143795285673328}
{'date_posted': '2018-04-16', 'funny': 0, 'helpful': 0, 'hour