# Twitch Chat Scraping

Main source of code: https://www.learndatasci.com/tutorials/how-stream-text-data-twitch-sockets-python/

In [1]:
### Setting up the Twitch IRC Socket - Part 1: General Info

from Twitch_key import *
import socket

server = 'irc.chat.twitch.tv' #twitch provided
port = 6667 #twitch provided
nick = 'b_e_green' #your OWN twitch account
token = Twitch_key() #however you'd like to generate your oauth token
channel = '#adinross' #the channel you want to scrape from; change to whomever!

In [2]:
### Pulling comments via the Connection 

import logging
import time
from emoji import demojize


# Create a log file that saves your pull!
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s — %(message)s',
                    datefmt='%Y-%m-%d_%H:%M:%S',
                    handlers=[logging.FileHandler('chat.log', encoding='utf-8')])


# Connect to Socket
sock = socket.socket()
sock.connect((server, port))

sock.send(f"PASS {token}\n".encode('utf-8'))
sock.send(f"NICK {nick}\n".encode('utf-8'))
sock.send(f"JOIN {channel}\n".encode('utf-8'))


# Loop for pull - We included a timer to run for a certain amount of seconds
start_time = time.time()
seconds = input("Enter: ")
seconds = int(seconds)
while True:
    current_time = time.time()
    elapsed_time = current_time - start_time

    resp = sock.recv(2048).decode('utf-8')

    if resp.startswith('PING'):
        sock.send("PONG\n".encode('utf-8'))
    
    elif len(resp) > 0:
        logging.info(demojize(resp))
        
    if elapsed_time > seconds:
        print("Finished iterating in: " + str(int(elapsed_time))  + " seconds")
        break
        sock.close() #closes the socket after your time runs out

Enter: 25
Finished iterating in: 25 seconds


In [3]:
### Generate a dataframe from the comments you pulled

import pandas as pd
from datetime import datetime
import re

def get_chat_dataframe(file):
    data = []

    with open(file, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n\n')
        
        for line in lines:
#             print(line)
            try:
                cmnt_time = line.split('—')[0].strip()
                cmnt_time = datetime.strptime(cmnt_time, '%Y-%m-%d_%H:%M:%S')

                msg = line.split('—')[1:]
                msg = '—'.join(msg).strip()

                username, channel, message = re.search(
                    ':(.*)\!.*@.*\.tmi\.twitch\.tv PRIVMSG #(.*) :(.*)', msg
                ).groups()

                d = {
                    'dt': cmnt_time,
                    'channel': channel,
                    'username': username,
                    'message': message
                }

                data.append(d)
            
            except Exception:
                pass
            
    return pd.DataFrame(data)
        

In [4]:
df = get_chat_dataframe('chat.log')

2021-09-05_02:35:55 — :tmi.twitch.tv 001 b_e_green :Welcome, GLHF!
:tmi.twitch.tv 002 b_e_green :Your host is tmi.twitch.tv
:tmi.twitch.tv 003 b_e_green :This server is rather new
:tmi.twitch.tv 004 b_e_green :-
:tmi.twitch.tv 375 b_e_green :-
:tmi.twitch.tv 372 b_e_green :You are in a maze of twisty passages, all alike.
:tmi.twitch.tv 376 b_e_green :>
:b_e_green!b_e_green@b_e_green.tmi.twitch.tv JOIN #adinross
:b_e_green.tmi.twitch.tv 353 b_e_green = #adinross :b_e_green
:b_e_green.tmi.twitch.tv 366 b_e_green #adinross :End of /NAMES list
:iskims!iskims@iskims.tmi.twitch.tv PRIVMSG #adinross :no elp
:bd9jackhammer_!bd9jackhammer_@bd9jackhammer_.tmi.twitch.tv PRIVMSG #adinross :That mf gettin penetrated
:finesse_559!finesse_559@finesse_559.tmi.twitch.tv PRIVMSG #adinross :L GAME
:drelordtv!drelordtv@drelordtv.tmi.twitch.tv PRIVMSG #adinross :BROO
:jacuzzi_birth!jacuzzi_birth@jacuzzi_birth.tmi.twitch.tv PRIVMSG #adinross :your so sexy
:finesse_559!finesse_559@finesse_559.tmi.twitch.tv P

In [5]:
### Take a look at your dataframe!

#df.set_index('dt', inplace=True) #use if you want the index to be the time

print(df.shape)

df.head()

(57, 4)


Unnamed: 0,dt,channel,username,message
0,2021-09-05 02:35:55,adinross,iskims,no elp
1,2021-09-05 02:35:56,adinross,certifiedcartistan,the swat is comin
2,2021-09-05 02:35:56,adinross,sync_____________________,!sub for no ads !prime to sub for free
3,2021-09-05 02:35:56,adinross,tj_is_heartless,Loooool idk bro
4,2021-09-05 02:35:56,adinross,aliayah_islit07,unplug his life support dummy
