### Changelog 

- Separating **process_iplist()** output from VT calls to folders (todays_date)
- changed output of **process_json()** FROM "_parsed-combined" to todays_date_parsed-combined in EACH RESPECTIVE FOLDER 

### Flow

1. User edits and feeds ip.csv
2. **process_iplist()** reads each and calls VT api 
3. Responses are stored in **"downloaded_vtresponse"** and seperated into folders by **respective dates** (DDMMYYYY)
4. **process_json()** reads jsons in each folder and generates a compilation for that day in each folder

### Questions / Todo List

**Update** 

I have managed to get the scripts running from jupyter notebook but have the following questions

(a) Receiving (via a web interface) either an individual or list of domais / IP addresses --> currently it's fed via CSV, should i create a front-end for people to upload their files?

(b) Storing list of domains / IP address into a queue based list --> Is this the back-end of things? That is to say, this script runs in the back-end and whenever files come in from front-end it'll trigger the script?

(c) Carryout enrichment --> Where does this "processed in the previous X days" come from? From what i understand, should i create a check such that when new information comes in, it will look at previous histories when the IP/Domain was checked, and continue / stop accordingly? 

(d) storing responses in disk and extracting subset into DB --> is there a specific subset you'd like? DB-wise I would prefer to try NoSQL as i have no experience with it!

In [1]:
import base64
import hashlib
import json
import requests
import time
import csv
import datetime
import os
import pandas as pd
from dateutil import tz
import pytz
from pymongo import MongoClient


json_template_ip = {
    
    "ip_address": "",
    "whois_date": "",
    "last_analysis_date": "",
    "reputation": "",
    "last_analysis_stats": "",
    "total_votes": "",
    "as_owner": "",
    "country": "",
    "asn": "",
    "image":"",
    "processed_date":""
       
}

API_KEY = '0d9fdb6e32d74b9d12e3d894309531838c3aabe8d66b049fd3a7976fbedf2c68'  #@param  {type: "string"}

client = MongoClient('localhost',27017)
db = client['d_ip_enrich']
    


def process_iplist(filename_to_process, columnIndex, x_days_ago):
    
    print("======= process_iplist() START =======")
    
    # TODO: Make generalised and incorporate timestamp in foldername
    now = datetime.datetime.now()
    dt_string = now.strftime("%d%m%Y")
    d = datetime.timedelta(days = x_days_ago)
    deducted_date = (now - d).strftime("%d%m%Y")
    
    with open(filename_to_process + ".csv", newline='') as inputfile:

        if not os.path.exists("downloaded_vtresponse"):
                os.makedirs("downloaded_vtresponse")

        with open(filename_to_process + "_tracker_" + dt_string + ".csv", 'w', newline='') as outputfile:
            
            ip_list = csv.reader(inputfile, delimiter=',')
            output_writer = csv.writer(outputfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

            counter = 0

            for row in ip_list:
                
                if counter == 0:
                    output_writer.writerow(row + ["Time Run"])
                    outputfile.flush()
                    counter += 1
                    continue

                print("Processing: #" + str(counter) + " - " + row[columnIndex])
                
                # Check if IP was proceeded x_days_ago, if yes, will skip calling
                file_skip = to_skip(row[0], "downloaded_vtresponse", x_days_ago)
                
                 # make dir to store API Responses
                if not os.path.exists("downloaded_vtresponse/" + dt_string):
                    os.makedirs("downloaded_vtresponse/" + dt_string)

                #Get an IP address Report
                if file_skip == 0:
                    r = requests.get("https://www.virustotal.com/api/v3/ip_addresses/"+row[0], headers={"x-apikey":API_KEY})
                    
        #with open("downloaded_vtresponse/" + row[columnIndex] +".json", "w") as outfile:
                    with open("downloaded_vtresponse/" + dt_string + "/" + row[columnIndex] + ".json", "w") as outfile:

                        outfile.write(r.text)

                    output_writer.writerow(row + [datetime.datetime.now()] + ["Processed"]) 

                    outputfile.flush()
                    time.sleep(16)

                
                else:
                    
                    output_writer.writerow(row + [datetime.datetime.now()] + ["Not Processed"]) 
                    outputfile.flush()
                    
                
                counter += 1
                
    print("======= process_iplist() END ======= \n\n")
#                 time.sleep(16)


# check if file exist in folder_to_process during x_days_ago, returns 0 or 1
def to_skip(filename, folder_to_process, x_days_ago):
    
    now = datetime.datetime.now()
    dt_string = now.strftime("%d%m%Y")
    d = datetime.timedelta(days = x_days_ago)
    deducted_date = (now - d).strftime("%d%m%Y")
    to_skip = 0

    folders = os.listdir("downloaded_vtresponse")
    folders = os.listdir(folder_to_process)
    

    for folder in folders:

        if to_skip == 1:
            break

        # target folders within X days range
        if folder >= deducted_date:
            files_array = os.listdir("downloaded_vtresponse/" + folder)
#                         print(files_array)

            for file in files_array:
                filename_filetype = file.rsplit('.',1)
    #             print(filename_filedate)

                ## if filename == target THEN SKIP + WRITE A NOTE
                if filename == filename_filetype[0]:
                    print(f"file has been processed on {folder} which is <{x_days_ago} days ago, will skip API call")
                    to_skip = 1
                    break
        
    
    return to_skip


    #     print(os.listdir("downloaded_vtresponse/"+ folder))    

def process_json_folder(folder_to_process,json_template):
    
    print("======= process_json_folder() START =======")
    
    # Get Date + Time to input later
    now = datetime.datetime.now(pytz.timezone("Singapore"))
    dt_string = now.strftime("%d%m%Y")

    # Usual Folder: downloaded_vtresponse
    combined_df = pd.DataFrame()
    
    
    for filename in os.listdir(folder_to_process):
        f = os.path.join(folder_to_process, filename)
    
#         print("f:", f)
        # check if it is a file
        if os.path.isfile(f) and f[-5:]==".json":

            print("Processing:",f)
            # Opening JSON file
            f = open(f)
#             print(f)

            # returns JSON object as
            # a dictionary
            data = json.load(f)
#             print(data)

            # load JSON template
            new_row = json_template

            # populate fields in JSON template
            for key in new_row:    
#                 print("current key", key)

                try:
                    current_value = data['data']['attributes'][key]
                    
                    # replace epoch with legible date format for whois_date and last_analysis_date
                    if key[-4:] == "date":
                        to_zone = tz.gettz('Singapore')
                        date_time = datetime.datetime.fromtimestamp( current_value )  
#                         current_value = date_time.replace(tzinfo=to_zone)
                        date_time.replace(tzinfo=to_zone)
                        current_value = date_time
                        
            
                    new_row[key] = current_value

                except Exception as e: 
               
                    if key == "processed_date":
                        new_row[key] = now
                        print("new_row[key]:", now)
                    
                    elif key == "ip_address":
                        new_row[key] = data['data']['id']
                    
                    else:
                        print(key,"not found with exception:",e)

            print("new_row:", new_row)
            db.ip.insert_one(new_row)

            df_result = pd.json_normalize(new_row)
            
            combined_df = pd.concat([combined_df, df_result], ignore_index=True, sort=False)
    
    
#     print(combined_df)
    now = datetime.datetime.now(pytz.timezone("Singapore"))
    dt_string = now.strftime("%d%m%Y")
    
    combined_df.to_csv(folder_to_process + '/' + dt_string + '_parsed-combined.csv')
    
    print("======= process_json_folder() END ======= \n\n")



# Process the list of IPs (CSVs ok but IP must be x column in the list)
# (filename.csv, column, x_days_ago)
process_iplist("ip", 0,7)


# Process the downloaded VT JSONs
# process_json_folder("downloaded_vtresponse_10Jan_combinedFull5k")
process_json_folder("downloaded_vtresponse/23022023",json_template_ip)


print("completed")
exit(0)





Processing: #1 - 34.123.194.52
file has been processed on 22022023 which is <7 days ago, will skip API call
Processing: #2 - 77.120.241.130
file has been processed on 22022023 which is <7 days ago, will skip API call
Processing: #3 - 80.253.95.99
file has been processed on 22022023 which is <7 days ago, will skip API call
Processing: #4 - 1.1.1.1


Processing: downloaded_vtresponse/23022023\1.1.1.1.json
country not found with exception: 'country'
image not found with exception: 'image'
new_row[key]: 2023-02-23 16:59:19.771463+08:00
new_row: {'ip_address': '1.1.1.1', 'whois_date': datetime.datetime(2023, 2, 14, 3, 46, 44), 'last_analysis_date': datetime.datetime(2023, 2, 23, 13, 22, 18), 'reputation': 89, 'last_analysis_stats': {'harmless': 75, 'malicious': 1, 'suspicious': 0, 'undetected': 12, 'timeout': 0}, 'total_votes': {'harmless': 73, 'malicious': 12}, 'as_owner': 'CLOUDFLARENET', 'country': '', 'asn': 13335, 'image': '', 'processed_date': datetime.datetime(2023, 2, 23, 16, 59, 19

In [9]:
json_template_general = {
    
    "DNS": "", 
    "Whois": "",
    "whois_date": "", ## CONVERT FROM EPOCH TO USER FRIENDLY DATE
    "last_analysis_date": "",
    "creation_date": "",
    "reputation": "",
    "registrar": "",
    "last_analysis_stats": "",  ## SEPERATE INTO 5 COLUMNS?
    "last_https_certificate": "",
    "categories": "",
    "total_votes": "",
    "as_owner": "",
    "country": "",
    "asn": "",
    "download_archived_page":"",
    "image":"",
    "processed_date":""     ## OWN FIELD TO CHECK X+7 DAYS
    
    
}

In [2]:
json_template_ip = {
    
    "ip_address": "",
    "whois_date": "",
    "last_analysis_date": "",
    "reputation": "",
    "last_analysis_stats": "",
    "total_votes": "",
    "as_owner": "",
    "country": "",
    "asn": "",
    "image":"",
    "processed_date":""
    
    
}

In [15]:
json_template_domain = {
    
    "DNS": "", 
    "Whois": "",
    "whois_date": "",
    "last_analysis_date": "",
    "creation_date": "",
    "reputation": "",
    "registrar": "",
    "last_analysis_stats": "",
    "last_https_certificate": "",
    "categories": "",
    "total_votes": "",
    "download_archived_page":"",
    "image":"",
    "processed_date":""
    
    
}

In [19]:
for key in json_template_ip:
    json_template_ip[key] = 1 
    
json_template_ip

{'whois_date': 1,
 'last_analysis_date': 1,
 'reputation': 1,
 'last_analysis_stats': 1,
 'total_votes': 1,
 'as_owner': 1,
 'country': 1,
 'asn': 1,
 'image': 1}

In [21]:
import json
f = open("downloaded_vtresponse/22022023/34.123.194.52.json")
data = json.load(f)
data['data'].keys()
data['data']['type']

TypeError: string indices must be integers

In [7]:
import json
import datetime


now = datetime.datetime.now()
dt_string = now.strftime("%d%m%Y")
    
f = open("downloaded_vtresponse/22022023/34.123.194.52.json")

new_row = json_template_ip 
loaded_json = json.load(f)
                

loaded_json['data']['id']

'34.123.194.52'

In [2]:
import datetime
from dateutil import tz

epoch_time = 1676767998

date_time = datetime.datetime.fromtimestamp( epoch_time )  

print("utc:", date_time)

to_zone = tz.gettz('Singapore')
new_date_time = date_time.replace(tzinfo=to_zone)


print("sgt:", new_date_time)



utc: 2023-02-19 08:53:18
sgt: 2023-02-19 08:53:18-07:00


'16022023'

In [22]:
import os
import datetime


now = datetime.datetime.now()
dt_string = now.strftime("%d%m%Y")
d = datetime.timedelta(days = 7)
a = now - d
a = a.strftime("%d%m%Y")
a

print(a)

folders = os.listdir("downloaded_vtresponse")

for folder in folders:
    if folder >= a:
        files_array = os.listdir("downloaded_vtresponse/" + folder)
        print(files_array)
        
        for file in files_array:
#             filename_filedate = file.split('_')
#             print(filename_filedate)
            
            ## if filename == target THEN SKIP + WRITE A NOTE

        print(folder)
        
#     print(os.listdir("downloaded_vtresponse/"+ folder))
    

16022023
['22022023_parsed-combined.csv', '34.123.194.52.json', '77.120.241.130.json', '80.253.95.99.json']
['22022023', 'parsed-combined.csv']
['34.123.194.52.json']
['77.120.241.130.json']
['80.253.95.99.json']
22022023
['23022023_parsed-combined.csv', '34.123.194.52.json', '77.120.241.130.json', '80.253.95.99.json']
['23022023', 'parsed-combined.csv']
['34.123.194.52.json']
['77.120.241.130.json']
['80.253.95.99.json']
23022023


In [15]:
test = "34.123.194.52"
test.rsplit('.',1)

['34.123.194', '52']

In [5]:
from pymongo import MongoClient
client = MongoClient('localhost',27017)
db = client['test']
db
# db.student.insert({"Akshay":500})

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'test')

In [4]:
collection = db.test_collection
collection

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'test'), 'test_collection')

In [6]:
collection.insert_one({"Test":"Hello"})

<pymongo.results.InsertOneResult at 0x2108f0a8af0>

In [7]:
import datetime
import pytz
now = datetime.datetime.now(pytz.timezone("Singapore"))
now



datetime.datetime(2023, 2, 23, 16, 12, 31, 470966, tzinfo=<DstTzInfo 'Singapore' +08+8:00:00 STD>)

In [None]:
""