In [17]:
'''
NAME    : search_compilation
PURPOSE : Process a raw hub search log and extract, for every free-text search,
          the token, reference, hub, IP, nick and query.
          NOTE(review): the original header also listed "hostel"; no hostel
          field is derived in the visible cells -- confirm whether that happens
          elsewhere.
INPUT   : query_input.txt
          NOTE(review): input_file below is 'query_input#.txt'; the '#' looks
          like a placeholder for a file number -- confirm the intended name.
OUTPUT  : query_output.csv

NOTE: each "$Search" line identifies the searcher either by "<ip>:<port>" or by
"Hub:<nick>", followed by one of two markers:
--- F?T?0?9? : NICK + TTH    (marker is followed by "TTH:<hash>")
--- F?T?0?9? : IP   + TTH
--- F?T?0?1? : IP   + QUERY  (marker is followed by the query text, with '$'
--- F?T?0?1? : NICK + QUERY   standing in for spaces)
Only the F?T?0?1? lines are turned into rows; TTH searches are skipped.
'''

# File names used throughout the notebook.
# file_name is not read by any other visible cell.
file_name = 'search_compilation'
# Raw search log, read line by line in the "READING INPUT FILE" cell.
input_file = 'query_input#.txt'
# CSV written by the "SAVING QUERY LIST" cell and re-read for the analysis.
output_file = 'query_output.csv'

# PRELIMINARY

In [18]:
'''IMPORTING LIBRARIES'''

import pandas as pd
import numpy as np
import csv
import os
from pathlib import Path

# Remove pandas' row-display limit so displayed frames/series are not truncated.
# Keep using .head(n) in the cells below, otherwise whole frames get printed.
pd.set_option('display.max_rows', None)

In [19]:
'''REMOVING PREVIOUS OUTPUT FILES'''

# Delete the output of a previous run so this run starts from a clean file.
# The path comes from output_file (not a second hard-coded copy of the name),
# so changing the configured output name cannot leave a stale file behind.
previous_output = Path(output_file)
if previous_output.is_file():
    previous_output.unlink()

# READING INPUT

In [20]:
'''READING INPUT FILE'''

# The log is only read, so open it read-only: "r+" also demands write access
# and fails on read-only files. The with-block closes the handle even if
# reading raises.
with open(input_file, "r") as file:
    # One list element per log line, each still carrying its trailing newline.
    query_input = file.readlines()

# Preview the first few raw lines.
query_input[:5]

['0 [In] 10.4.5.10:500 (Hub <10.4.5.10:500>): $Search 10.3.9.24:5770 F?T?0?9?TTH:IXMXMEU5B5SGPLSCVQ6PNKAV67BUFJMVB67THSI\n',
 '1 [In] 10.4.5.10:500 (Hub <10.4.5.10:500>): $Search Hub:bunnydsm F?T?0?9?TTH:U6MGXOFEEDTKXOHLV6EZAQ6IXKXKX6UYMBFROQI\n',
 '2 [In] 10.4.7.111:500 (Hub <10.4.7.111:500>): $Search Hub:fyfyfyfyfyfyfyfyffyfyfy F?T?0?9?TTH:2IFHEE6TGFCKKRUL7MZXLKW54HDTOVXYUJLDOCQ\n',
 '3 [In] 10.4.7.111:500 (Hub <10.4.7.111:500>): $Search Hub:Oranos F?T?0?9?TTH:QDW3TODI3MWPIQMQKA6IXBCLDBPOQCLKNZW4VRA\n',
 '4 [In] 10.4.5.10:500 (Hub <10.4.5.10:500>): $Search Hub:Oranos F?T?0?9?TTH:QSFQSRTVYNJFWBFFAL3DO74Z7J727QRY27AI27Y\n']

# Query List

In [21]:
'''DEFINING QUERY LIST'''

# query_list is a list of rows; the first row is the CSV header.
header_row = ["Token", "Reference", "Hub", "IP", "Nick", "Query"]
query_list = [header_row]

In [22]:
'''ADDING QUERIES TO QUERY LIST'''

QUERY_MARKER = 'F?T?0?1?'          # free-text search; TTH searches use F?T?0?9?
SEARCH_COMMAND = '$Search'
NICK_PREFIX = 'Hub:'               # reference is "Hub:<nick>" rather than "<ip>:<port>"
TITAN_ADDRESS = '10.4.7.111:500'   # any other hub address is reported as Woodstock


def parse_search_line(line):
    """Convert one raw log line into a query_list row.

    Expected shape of a line (the trailing newline is optional):
        '<token> [In] <hub addr> (Hub <<hub addr>>): $Search <reference> F?T?0?1?<query>'
    where <reference> is either '<ip>:<port>' or 'Hub:<nick>' and spaces in
    <query> are encoded as '$'.

    Parameters
    ----------
    line : str
        A single line of the search log.

    Returns
    -------
    list or None
        [token, reference, hub, ip, nick, query], or None when the line does
        not contain the F?T?0?1? marker (e.g. TTH searches, skipped for now).
    """
    marker_at = line.find(QUERY_MARKER)
    if marker_at == -1:
        return None

    command_at = line.find(SEARCH_COMMAND)

    # The reference sits between "$Search " and the space preceding the marker.
    reference = line[command_at + len(SEARCH_COMMAND) + 1:marker_at - 1]

    # Decide nick-vs-IP from the reference itself, not from 'Hub:' appearing
    # anywhere in the line, so a query text containing "Hub:" cannot turn an
    # IP-identified search into a bogus nick.
    if reference.startswith(NICK_PREFIX):
        nick = reference[len(NICK_PREFIX):]
        reference = nick
        ip = '-'
    else:
        ip = reference
        nick = '-'

    # Query text: drop the line terminator only if it is actually there (the
    # last line of a file may lack one), then decode '$' back to spaces.
    query = line[marker_at + len(QUERY_MARKER):]
    if query.endswith('\n'):
        query = query[:-1]
    query = query.replace('$', ' ')

    # Token: everything before the " [" that opens the "[In]" tag.
    token = line[:line.find('[') - 1]

    # Hub: look for the Titan address only in the log prefix (before "$Search")
    # so that a searcher address or query text cannot be mistaken for the hub.
    log_prefix = line[:command_at] if command_at != -1 else line
    hub = 'Titan' if TITAN_ADDRESS in log_prefix else 'Woodstock'

    return [token, reference, hub, ip, nick, query]


# Replace everything after the header row instead of appending, so re-running
# this cell does not duplicate rows.
query_list[1:] = [
    row for row in map(parse_search_line, query_input) if row is not None
]


In [23]:
'''SAVING QUERY LIST'''

# newline='' hands line-ending control to the csv module; without it every row
# is followed by a blank line on platforms that translate '\n' on write.
with open(output_file, 'w', newline='') as myFile:
    writer = csv.writer(myFile)
    # query_list holds the header row first, then one row per parsed query.
    writer.writerows(query_list)

# Analysis

In [24]:
'''IMPORTING FORMATTED QUERY LIST'''

# Reference, IP, Nick and Query hold arbitrary user text. Read them as plain
# strings and switch off pandas' default NA parsing; otherwise values such as
# "NA", "null" or "nan" become NaN and silently vanish from the group-by below.
text_columns = ['Reference', 'IP', 'Nick', 'Query']
query_df = pd.read_csv(
    output_file,
    dtype={column: str for column in text_columns},
    keep_default_na=False,
)
print("Total Queries made : ", len(query_df))

# Preview of the parsed queries.
query_df.head(20)

Total Queries made :  6073


Unnamed: 0,Token,Reference,Hub,IP,Nick,Query
0,141,10.4.11.151:5134,Woodstock,10.4.11.151:5134,-,kayden kross
1,149,10.4.11.151:5134,Woodstock,10.4.11.151:5134,-,holly michaels
2,160,10.3.15.12:56411,Woodstock,10.3.15.12:56411,-,bootcamp
3,319,Tohra__Bappa,Titan,-,Tohra__Bappa,power systems
4,320,Tohra__Bappa,Woodstock,-,Tohra__Bappa,power systems
5,346,Tohra__Bappa,Titan,-,Tohra__Bappa,power system engineering
6,347,Tohra__Bappa,Woodstock,-,Tohra__Bappa,power system engineering
7,361,Tohra__Bappa,Titan,-,Tohra__Bappa,power electronics
8,363,Tohra__Bappa,Woodstock,-,Tohra__Bappa,power electronics
9,378,thepunisher...,Woodstock,-,thepunisher...,mup labs


In [25]:
'''MOST QUERIES BY A NICK'''

# Searches identified by IP carry the placeholder nick '-'. Filter that
# placeholder out by value rather than slicing off the first sorted entry:
# the slice only works when '-' happens to be the largest group, and otherwise
# discards a real nick while keeping '-'.
nick_queries = query_df[query_df['Nick'] != '-']
most = nick_queries.groupby('Nick').size().sort_values(ascending=False)
most.head(15)

Nick
chal_hat            60
Greay               54
123456helloworld    52
q;qofeivneo[in      49
baba_bakchod        42
108                 42
indiejjhsauhx2a     40
jellybear           38
Dumber              34
AstonMartin         30
Stratocaster        30
barnaana            29
alamara             29
brokamladi          28
Capt.America        28
dtype: int64

In [26]:
# Compare the number of raw log lines with the number of parsed queries.
total_lines = len(query_input)
query_rows = len(query_df)

print(f"Total Length : {total_lines}")
print(f"Query Length : {query_rows}")
# Ratio of raw log lines to parsed query rows, rounded to two decimals.
print(f"Search Ratio : {round(total_lines / query_rows, 2)}")

Total Length : 50833
Query Length : 6073
Search Ratio : 8.37


# BACKUP

In [28]:
'''BACKUP'''

# Provide backup_as
backup_as = 'query_backup_1.csv'

# Directory that collects the backups; create it on first use so to_csv does
# not fail when it is missing.
backup_dir = 'query_backup'
os.makedirs(backup_dir, exist_ok=True)

# Export the query dataframe to a csv file as a backup.
query_df.to_csv(os.path.join(backup_dir, backup_as), index=False, header=True)