In [54]:
'''
NAME    : search_compilation
PURPOSE : Parse a raw hub traffic log and tabulate every free-text search as
          (token, reference, hub, IP, nick, query).
INPUT   : query_input.txt   (raw log, one message per line)
OUTPUT  : query_output.csv  (one row per free-text search)

SEARCH FLAGS FOUND IN THE LOG:
--- F?T?0?9? : TTH lookup, sent either by NICK or by IP  (skipped for now)
--- F?T?0?1? : text query, sent either by NICK or by IP  (extracted)
'''

# Notebook name, kept for reference.
file_name = 'search_compilation'

# Paths of the raw log that is read and of the CSV that is written.
input_file, output_file = 'query_input.txt', 'query_output.csv'

# PRELIMINARY

In [55]:
'''IMPORTING LIBRARIES'''

import pandas as pd
import numpy as np
import csv
import os
from pathlib import Path

# Show every row when a frame/series is displayed (None disables truncation).
pd.options.display.max_rows = None

In [56]:
'''REMOVING PREVIOUS OUTPUT FILES'''

# Delete the CSV left behind by an earlier run so results never mix.
# The name is taken from `output_file` (not repeated as a literal) so this
# cleanup stays in sync with the cell that writes the file.
for stale_name in [output_file]:
    stale_path = Path(stale_name)
    if stale_path.is_file():   # a missing path (or a directory) is skipped
        stale_path.unlink()

# READING INPUT

In [107]:
'''READING INPUT FILE'''

# The log is only read, so plain "r" is enough ("r+" would also demand write
# permission). The context manager closes the handle even if reading fails.
with open(input_file, "r") as log_file:
    query_input = log_file.readlines()

# Preview the first raw lines. This must index `query_input`; the previous
# `list[:5]` referred to the builtin `list` type, not to the data just read.
query_input[:5]

['0 [In] 10.4.7.111:500 (Hub <10.4.7.111:500>): $Search Hub:the_dark_passenger F?T?0?9?TTH:K24JLCG5YPFIERJWOENKAW4776ZESEUOCLFKXYI\n',
 '1 [In] 10.4.5.10:500 (Hub <10.4.5.10:500>): $HubName WoodStock - Welcome to Wood$tock\n',
 '2 [In] 10.4.7.111:500 (Hub <10.4.7.111:500>): $HubName TITAN - Lost One Plus 5 (Slate Grey) Near AH6 or C- Mess. Contact- 9988112646 / 9167790456\n',
 '3 [In] 10.4.5.10:500 (Hub <10.4.5.10:500>): $Search Hub:SSG F?T?0?9?TTH:EFBEX2XQ6U6J4GPD4LUH4BFY43JQRFAMEQ3SNVY\n',
 '4 [Out] 10.4.5.10:500 (Hub <10.4.5.10:500>): $SR Jedaiah _BITS\\ID Pics\\2017 Batch ID pics [All]\\2017A7PS0107.jpg\x051676638 5/5\x05TTH:EFBEX2XQ6U6J4GPD4LUH4BFY43JQRFAMEQ3SNVY (10.4.5.10:500)\x05SSG|\n']

# Query List

In [58]:
'''DEFINING QUERY LIST'''

# Row 0 is the CSV header; every parsed search is appended after it.
query_list = [
    ["Token", "Reference", "Hub", "IP", "Nick", "Query"],
]

In [59]:
'''ADDING QUERIES TO QUERY LIST'''


def _parse_query_line(line):
    """Turn one raw log line into a query_list row.

    Returns None when the line does not carry the free-text search flag
    'F?T?0?1?' (for example TTH lookups, which are skipped for now).
    Otherwise returns [token, reference, hub, ip, nick, query], where
    exactly one of ip / nick holds the sender and the other is '-'.
    """
    flag = 'F?T?0?1?'
    flag_pos = line.find(flag)
    if flag_pos == -1:
        # Search for a specific file TTH (or not a search at all), skip for now.
        return None

    # Decide nick-vs-IP from the text BEFORE the flag only, so a query that
    # itself contains "Hub:" cannot flip the classification.
    if 'Hub:' in line[:flag_pos]:
        # reference = nick; 12 == len('$Search Hub:')
        sender_start = line.find('$Search Hub') + 12
        reference = line[sender_start:flag_pos - 1]
        ip, nick = '-', reference
    else:
        # reference = IP; 8 == len('$Search ')
        sender_start = line.find('$Search') + 8
        reference = line[sender_start:flag_pos - 1]
        ip, nick = reference, '-'

    # Define query: '$' stands in for spaces in the logged search text. Only a
    # trailing newline is removed, so a final line without one keeps its last
    # character (a blind [:-1] would cut it off).
    # NOTE(review): some logged queries end with '|'; it is kept as-is here -
    # confirm whether it should be stripped as well.
    query = line[flag_pos + len(flag):].replace('$', ' ').rstrip('\n')

    # Define token: the text in front of the first " [" on the line.
    token = line[:line.find('[') - 1]

    # Define hub: lines mentioning 10.4.7.111:500 belong to Titan, all others
    # are attributed to Woodstock.
    hub = 'Titan' if '10.4.7.111:500' in line else 'Woodstock'

    return [token, reference, hub, ip, nick, query]


# Add one row to query_list for every free-text search found in the log.
for line in query_input:
    row = _parse_query_line(line)
    if row is not None:
        query_list.append(row)


In [60]:
'''SAVING QUERY LIST'''

# newline='' hands line-ending control to the csv module; without it an extra
# blank row appears between records on platforms that write "\r\n".
with open(output_file, 'w', newline='') as csv_file:
    csv.writer(csv_file).writerows(query_list)

# Analysis

In [100]:
'''IMPORTING FORMATTED QUERY LIST'''

# Nicks and queries are free text typed by users, so strings such as "NA",
# "null" or "None" are real values. keep_default_na=False stops pandas from
# turning them into NaN (which groupby would then silently drop).
query_df = pd.read_csv(output_file, keep_default_na=False)
print("Total Queries made : ", len(query_df))

query_df.head(20)

Total Queries made :  2839


Unnamed: 0,Token,Reference,Hub,IP,Nick,Query
0,26,10.3.9.172:6482,Woodstock,10.3.9.172:6482,-,a|
1,27,10.3.9.172:6482,Titan,10.3.9.172:6482,-,a|
2,3363,M213121,Woodstock,-,M213121,world of warcraft
3,3367,M213121,Titan,-,M213121,world of warcraft
4,3614,10.3.9.172:6482,Woodstock,10.3.9.172:6482,-,movies|
5,3615,10.3.9.172:6482,Titan,10.3.9.172:6482,-,movies|
6,6473,10.4.9.100:5292,Titan,10.4.9.100:5292,-,seborg
7,6647,lazylad,Woodstock,-,lazylad,jumanji
8,6650,lazylad,Titan,-,lazylad,jumanji
9,6965,indiejjhsauhx2a,Titan,-,indiejjhsauhx2a,sherlock


In [99]:
'''MOST QUERIES BY A NICK'''

# '-' is the placeholder stored in Nick when the search was identified by IP.
# Exclude it explicitly instead of dropping "whatever ranks first": if '-' is
# not the largest group, a positional [1:] would remove a real nick and leave
# the placeholder in the ranking.
nick_rows = query_df[query_df['Nick'] != '-']
most = nick_rows.groupby('Nick').size().sort_values(ascending=False)
most.head(15)

Nick
pnfodd             56
earphone           41
SlySeeker          36
cnnccsc            28
sar                26
ramboo123          23
wolfking01         20
ZeusGamer          19
snorlax            19
SNORLAX_R395       19
suniyo             18
fernanado          18
xyzabc             16
laud_lele_bc_bc    16
baba_bakchod       16
dtype: int64

In [68]:
# Raw log lines read vs. free-text searches extracted from them.
total_lines = len(query_input)
total_queries = len(query_df)

print("Total Length : " + str(total_lines))
print("Query Length : " + str(total_queries))

# The ratio must use the log itself (`query_input`); `len(list)` pointed at the
# builtin type and raises TypeError on a fresh kernel. An empty result set
# yields NaN instead of a ZeroDivisionError.
search_ratio = round(total_lines / total_queries, 2) if total_queries else float('nan')
print("Search Ratio : " + str(search_ratio))

Total Length : 87909
Query Length : 2839
Search Ratio : 30.96
