# Twitch Chat Scraping


In [6]:
### Imports

from Twitch_key import * ### I made this .py file to house my Twitch_API_KEY
import socket

import logging
import time
from emoji import demojize


import pandas as pd
from datetime import datetime
import re
import seaborn as sns
import matplotlib as plt

In [2]:
#######  Setting up the Twitch IRC Socket - Part 1: General Info ####### 

server = 'irc.chat.twitch.tv' #twitch provided
port = 6667 #twitch provided
nick = 'b_e_green' #your OWN twitch account
token = Twitch_key() #however you'd like to generate your oauth token
channel = '#adinross' #the channel you want to scrape from; change to whomever!

In [3]:
####### Pulling comments via the Connection ####### 

### Create a log file that saves your pull!
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s — %(message)s',
                    datefmt='%Y-%m-%d_%H:%M:%S',
                    handlers=[logging.FileHandler('chat.log', encoding='utf-8')])


### Connect to Socket
sock = socket.socket()
sock.connect((server, port))

sock.send(f"PASS {token}\n".encode('utf-8'))
sock.send(f"NICK {nick}\n".encode('utf-8'))
sock.send(f"JOIN {channel}\n".encode('utf-8'))


### Loop for pull - We included a timer to run for a certain amount of seconds
start_time = time.time()
seconds = input("Enter: ")
seconds = int(seconds)
while True:
    current_time = time.time()
    elapsed_time = current_time - start_time

    resp = sock.recv(2048).decode('utf-8')

    if resp.startswith('PING'):
        sock.send("PONG\n".encode('utf-8'))
    
    elif len(resp) > 0: # if you'd like to remove emojis from the text; Twitch uses a lot of them!
        logging.info(demojize(resp))
        
    if elapsed_time > seconds:
        print("Finished iterating in: " + str(int(elapsed_time))  + " seconds")
        break
        sock.close() #closes the socket after your time runs out

Enter: 120
Finished iterating in: 278 seconds


In [4]:
####### Generate a dataframe from the comments you pulled ####### 

### function to create dataframe from chat snippet
def get_chat_dataframe(file):
    data = []

    with open(file, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n\n')
        
        for line in lines:
#             print(line)
            try:
                cmnt_time = line.split('—')[0].strip()
                cmnt_time = datetime.strptime(cmnt_time, '%Y-%m-%d_%H:%M:%S')

                msg = line.split('—')[1:]
                msg = '—'.join(msg).strip()

                username, channel, message = re.search(
                    ':(.*)\!.*@.*\.tmi\.twitch\.tv PRIVMSG #(.*) :(.*)', msg
                ).groups()

                d = {
                    'dt': cmnt_time,
                    'channel': channel,
                    'username': username,
                    'message': message
                }

                data.append(d)
            
            except Exception:
                pass
            
    return pd.DataFrame(data)
        

In [7]:
####### Run DF function #######

df = get_chat_dataframe('chat.log')

In [8]:
####### Take a look at your dataframe! #######

#df.set_index('dt', inplace=True) #use if you want the index to be the time variable

### The shape of your DF
print(df.shape)


### Head & Tail of DF
display(df.head(), df.tail())


### What are the column data types for your DF?

print(df.dtypes)

(58, 4)


Unnamed: 0,dt,channel,username,message
0,2021-09-05 02:35:55,adinross,iskims,no elp
1,2021-09-05 02:35:56,adinross,certifiedcartistan,the swat is comin
2,2021-09-05 02:35:56,adinross,sync_____________________,!sub for no ads !prime to sub for free
3,2021-09-05 02:35:56,adinross,tj_is_heartless,Loooool idk bro
4,2021-09-05 02:35:56,adinross,aliayah_islit07,unplug his life support dummy


Unnamed: 0,dt,channel,username,message
53,2021-09-05 02:36:11,adinross,1gpa,suck billy off
54,2021-09-05 02:36:12,adinross,xxfantomboy25,Yo
55,2021-09-05 02:36:13,adinross :ACTION :star: NEW SUB :star: alexg_...,wizebot,star:
56,2021-09-05 02:36:13,adinross,prodbyrisen,OMG
57,2022-03-17 21:40:50,adinross,frostic120,Is lazybones going live


dt          datetime64[ns]
channel             object
username            object
message             object
dtype: object


In [14]:
####### Explore #######

print(df.username.value_counts()[:10], '\n\n##############\n\n')

print(df.channel.value_counts()[:10], '\n\n##############\n\n')

finesse_559                  4
wizebot                      4
lukeonrtx                    3
iskims                       3
sync_____________________    3
mikebintrippin               2
tlgoatttv                    2
mosesari305                  2
neveraidan                   1
playboybanks                 1
Name: username, dtype: int64 

##############


adinross                                                       54
adinross :ACTION :star: NEW SUB :star: alexg_409 (+1693)       1
adinross :ACTION :star: NEW SUB :star: LucidCiC (+1692)        1
adinross :ACTION :star: NEW SUB :star: iameliteeee (+1691)     1
adinross :ACTION :star: NEW SUB :star: tecap (+1690)           1
Name: channel, dtype: int64 

##############


