# Social Bots and Echo Chamber Detection

The cell below defines the abstract class whose API you will need to implement. Do NOT modify it.

In [6]:
# DO NOT MODIFY THIS CELL

from abc import ABC, abstractmethod  

class AbstractBotEchoDetection(ABC):
    """Interface that a social-bot / echo-chamber detector has to implement.

    Every method below is declared abstract, so a concrete subclass must
    override all four of them before it can be instantiated.
    """
    
    # constructor
    @abstractmethod
    def __init__(self):
        """Initialise the detector; the base implementation does nothing."""
        pass           
        
    # data initialisation
    @abstractmethod
    def createNetwork(self):
        """Load or build the network data; the base implementation does nothing."""
        pass

    # prints and returns a list of all detected social bots
    # bots : list
    @abstractmethod
    def detectSocialBots(self):  
        """Return the list of detected social bots.

        The contract stated above asks implementations to print the list as
        well as return it. The base implementation returns an empty list.
        """
        bots = []
        return bots 
    
    # prints and returns a list of all detected strongly connected components
    # scc : list
    @abstractmethod
    def detectStronglyConnectedComponents(self):     
        """Return the list of detected strongly connected components.

        The contract stated above asks implementations to print the list as
        well as return it. The base implementation returns an empty list.
        """
        scc = []
        return scc

Use the cell below to define any data structures and auxiliary Python functions you may need. Leave the implementation of the main API to the next code cell instead.

In [7]:
# ADD YOUR DATA STRUCTURE DEFINITIONS AND HELPER CODE HERE



Use the cell below to implement the requested API. If you have been experimenting with different social bot / echo chamber detection algorithms, the code below should report your final choice only.

In [3]:
import os
import pickle
import random

from sklearn.linear_model import LogisticRegression

class BotEchoDetection(AbstractBotEchoDetection):
    """Degree-rank based implementation of the bot / echo-chamber API.

    Two networks are described by a node file (``id<TAB>username`` per line)
    and an edge file (``id1 id2`` per line) each.  ``createNetwork`` derives
    first- and second-level degrees for every node, ranks the nodes of each
    network by second-level degree and caches every intermediate result on
    disk as a pickle.  The two ``detect*`` methods then translate id pairs
    into pairs of ranks.

    NOTE(review): the values returned by ``detectSocialBots`` and
    ``detectStronglyConnectedComponents`` are ``[rank1, rank2]`` pairs, not
    bot ids or graph components.  That is what the original code computed and
    it is preserved here; confirm that this is the intended final algorithm.
    """

    # ------------------------------------------------------------------
    # persistence helpers
    # ------------------------------------------------------------------

    @staticmethod
    def serializeDict(dictionary, tofilename):
        """Pickle *dictionary* into the file ``tofilename + '.pickle'``."""
        with open(tofilename + '.pickle', 'wb') as handle:
            pickle.dump(dictionary, handle, pickle.HIGHEST_PROTOCOL)

    @staticmethod
    def deserializeDict(fromfile):
        """Return the object pickled in *fromfile* (full file name expected).

        NOTE: ``pickle.load`` can execute arbitrary code; only use this on
        files produced by ``serializeDict`` in a trusted location.
        """
        with open(fromfile, 'rb') as handle:
            return pickle.load(handle)

    @staticmethod
    def _openForWriting(path):
        """Open *path* for text writing, creating its parent directory first."""
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        return open(path, 'w')

    @staticmethod
    def _countLines(path):
        """Return the number of lines in the text file *path*."""
        with open(path) as handle:
            return sum(1 for _ in handle)

    # ------------------------------------------------------------------
    # parsing helpers
    # ------------------------------------------------------------------

    @staticmethod
    def generateUsernameToIdDict(nodefile):
        """Map username -> id for every ``id<TAB>username`` line of *nodefile*.

        Lines that do not contain at least two tab-separated fields are
        skipped instead of raising ``IndexError``.
        """
        usernametoiddict = {}
        with open(nodefile) as handle:
            for line in handle:
                res = line.split("\t")
                if len(res) < 2:
                    continue
                usernametoiddict[res[1].strip("\t\r\n ")] = res[0].strip("\t\r\n ")
        return usernametoiddict

    @staticmethod
    def generateIdToFriendsIdListDict(edgefile):
        """Map id -> list of neighbour ids, treating every edge as undirected.

        Each line of *edgefile* is expected to hold two space-separated ids;
        lines with fewer fields are skipped.
        """
        idtofriendlistdict = {}
        with open(edgefile) as handle:
            for line in handle:
                res = line.split(" ")
                if len(res) < 2:
                    continue
                id1 = res[0].strip("\r\t\n ")
                id2 = res[1].strip("\r\t\n ")
                # Record the edge in both directions.
                idtofriendlistdict.setdefault(id1, []).append(id2)
                idtofriendlistdict.setdefault(id2, []).append(id1)
        return idtofriendlistdict

    @staticmethod
    def generateIdToFirstLevelDegreeDict(nodefile, idtofriendsidlistdict):
        """Map id -> number of direct neighbours for every node in *nodefile*.

        Nodes that never occur in an edge get degree 0 (the original lookup
        raised ``KeyError`` for them).
        """
        idtofirstleveldegreedict = {}
        with open(nodefile) as handle:
            for line in handle:
                nodeid = line.split("\t")[0].strip("\t\r\n ")
                if not nodeid:
                    continue  # blank line
                idtofirstleveldegreedict[nodeid] = len(idtofriendsidlistdict.get(nodeid, ()))
        return idtofirstleveldegreedict

    @staticmethod
    def generateIdToSecondLevelDegreeDict(nodefile, idtofriendsidlistdict, idtofirstleveldegreedict):
        """Map id -> sum of the first-level degrees of the node's neighbours.

        Isolated nodes get 0, and neighbours that are missing from
        *idtofirstleveldegreedict* contribute 0.  When an id occurs on
        several lines the first occurrence wins.
        """
        idtosecondleveldegreedict = {}
        with open(nodefile) as handle:
            for line in handle:
                nodeid = line.split("\t")[0].strip("\r\t\n ")
                if not nodeid or nodeid in idtosecondleveldegreedict:
                    continue
                idtosecondleveldegreedict[nodeid] = sum(
                    idtofirstleveldegreedict.get(friendid, 0)
                    for friendid in idtofriendsidlistdict.get(nodeid, ())
                )
        return idtosecondleveldegreedict

    @staticmethod
    def generateIdPairsFromGroundTruth(groundtruthfile, usernametoiddict1, usernametoiddict2):
        """Return ``(id1, id2)`` tuples for the username pairs in *groundtruthfile*.

        Every line is expected to hold two space-separated usernames.  A pair
        is kept only when the first name is known in *usernametoiddict1* and
        the second in *usernametoiddict2*.
        """
        pairslist = []
        with open(groundtruthfile) as handle:
            for line in handle:
                res = line.split(" ")
                if len(res) < 2:
                    continue
                username1 = res[0].strip("\r\t\n ")
                username2 = res[1].strip("\r\t\n ")
                if username1 in usernametoiddict1 and username2 in usernametoiddict2:
                    pairslist.append((usernametoiddict1[username1], usernametoiddict2[username2]))
        return pairslist

    @staticmethod
    def generateIdPairsNotFromGroundTruth(nodefile1, nodefile2, numberOfPairsToGenerate):
        """Return random ``(int, int)`` pairs drawn without replacement.

        The first component is sampled from ``range(1, <lines in nodefile1>)``
        and the second from ``range(1, <lines in nodefile2>)``.  If a file is
        too short to supply *numberOfPairsToGenerate* distinct values, as many
        pairs as possible are returned instead of raising ``ValueError``.

        NOTE(review): the sampled numbers are line indices, which are later
        used as node ids; this only works when ids are contiguous integers.
        """
        population1 = range(1, BotEchoDetection._countLines(nodefile1))
        population2 = range(1, BotEchoDetection._countLines(nodefile2))
        count = min(numberOfPairsToGenerate, len(population1), len(population2))
        if count <= 0:
            return []
        return list(zip(random.sample(population1, count), random.sample(population2, count)))

    # ------------------------------------------------------------------
    # result writers
    # ------------------------------------------------------------------

    @staticmethod
    def writeIdPairToSecondDegreeToFile(useridpairs, usernametoiddict1, usernametoiddict2, idtosecondleveldegreedict1, idtosecondleveldegreedict2, tofile):
        """Write one ``"<degree1> <degree2>"`` line per known pair to *tofile*.

        Pairs with an id missing from either degree dictionary are skipped.
        The two username dictionaries are unused and are kept only for
        signature compatibility.
        """
        with BotEchoDetection._openForWriting(tofile) as out:
            for useridpair in useridpairs:
                id1, id2 = useridpair[0], useridpair[1]
                if id1 in idtosecondleveldegreedict1 and id2 in idtosecondleveldegreedict2:
                    out.write(str(idtosecondleveldegreedict1[id1]) + ' ' + str(idtosecondleveldegreedict2[id2]) + '\n')

    @staticmethod
    def writeIdPairToSecondDegreeToTwoFiles(useridpairs, idtosecondleveldegreedict1, idtosecondleveldegreedict2, tofile1, tofile2):
        """Write the second-level degrees of each pair to two parallel files.

        Line *k* of *tofile1* and line *k* of *tofile2* belong to the same
        pair.  Ids are converted to ``str`` before the lookup; pairs with an
        unknown id are skipped (consistent with the single-file writer) so
        the two files always stay aligned.
        """
        with BotEchoDetection._openForWriting(tofile1) as out1, \
                BotEchoDetection._openForWriting(tofile2) as out2:
            for useridpair in useridpairs:
                key1, key2 = str(useridpair[0]), str(useridpair[1])
                if key1 not in idtosecondleveldegreedict1 or key2 not in idtosecondleveldegreedict2:
                    continue
                out1.write(str(idtosecondleveldegreedict1[key1]) + '\n')
                out2.write(str(idtosecondleveldegreedict2[key2]) + '\n')

    # ------------------------------------------------------------------
    # ranking
    # ------------------------------------------------------------------

    @staticmethod
    def _rankByValue(idtovalue):
        """Map ``int(id)`` -> 1-based rank, highest value first (ties keep dict order)."""
        ordered = sorted(idtovalue.items(), key=lambda item: item[1], reverse=True)
        return {int(key): rank for rank, (key, _) in enumerate(ordered, start=1)}

    @staticmethod
    def sortrankbyfirstleveldegree(idtofirstleveldegree):
        """Rank ids by first-level degree; returns ``{int(id): rank}`` with rank 1 the largest."""
        return BotEchoDetection._rankByValue(idtofirstleveldegree)

    @staticmethod
    def sortrankbysecondleveldegree(idtosecondleveldegreedict):
        """Rank ids by second-level degree; returns ``{int(id): rank}`` with rank 1 the largest."""
        return BotEchoDetection._rankByValue(idtosecondleveldegreedict)

    # ------------------------------------------------------------------
    # public API
    # ------------------------------------------------------------------

    def __init__(self, nodefile1="data/livejournal.nodes", edgefile1="data/livejournal.edges",
                 nodefile2="data/myspace.nodes", edgefile2="data/myspace.edges",
                 groundtruthfile=None, numberofrandompairs=1000):
        """Remember the input locations; no file is touched until ``createNetwork``.

        Parameters
        ----------
        nodefile1, edgefile1 : str
            Node and edge file of the first network.
        nodefile2, edgefile2 : str
            Node and edge file of the second network.
        groundtruthfile : str or None
            File with one ``username1 username2`` pair per line.  The
            original code referred to an undefined name ``gt`` here, so no
            default path is known; with ``None`` every ground-truth step is
            skipped and ``detectStronglyConnectedComponents`` returns ``[]``.
        numberofrandompairs : int
            Number of random id pairs to draw (was hard-coded to 1000).
        """
        super().__init__()
        self.nodefile1 = nodefile1
        self.edgefile1 = edgefile1
        self.nodefile2 = nodefile2
        self.edgefile2 = edgefile2
        self.groundtruthfile = groundtruthfile
        self.numberofrandompairs = numberofrandompairs

        # Filled in by createNetwork(); empty defaults keep the detect*
        # methods usable (returning []) even if it has not been called yet.
        self._usernametoid1 = {}
        self._usernametoid2 = {}
        self._gtpairs = []
        self._idtorank1 = {}
        self._idtorank2 = {}

    def _loadNetworkSide(self, nodefile, edgefile, suffix):
        """Build and pickle all per-network dictionaries for one network.

        Returns ``(username -> id, id -> second-level degree)``.  *suffix*
        ('1' or '2') is appended to the pickle base names.
        """
        cls = type(self)
        usernametoid = cls.generateUsernameToIdDict(nodefile)
        cls.serializeDict(usernametoid, 'usernametoid' + suffix)

        friends = cls.generateIdToFriendsIdListDict(edgefile)
        cls.serializeDict(friends, 'idtofriendlist' + suffix)

        firstlevel = cls.generateIdToFirstLevelDegreeDict(nodefile, friends)
        cls.serializeDict(firstlevel, 'idtofirstleveldegree' + suffix)

        secondlevel = cls.generateIdToSecondLevelDegreeDict(nodefile, friends, firstlevel)
        cls.serializeDict(secondlevel, 'idtosecondleveldegree' + suffix)
        return usernametoid, secondlevel

    def createNetwork(self):
        """Parse both networks, cache the derived data on disk and keep the ranks.

        Side effects: writes ``*.pickle`` files into the working directory
        and ``SLD_*`` text files into ``results/`` (created if necessary).
        The in-memory objects are used directly instead of being re-loaded
        from the pickles that were just written.
        """
        cls = type(self)
        self._usernametoid1, secondlevel1 = self._loadNetworkSide(self.nodefile1, self.edgefile1, '1')
        self._usernametoid2, secondlevel2 = self._loadNetworkSide(self.nodefile2, self.edgefile2, '2')

        self._gtpairs = []
        if self.groundtruthfile is not None:
            self._gtpairs = cls.generateIdPairsFromGroundTruth(
                self.groundtruthfile, self._usernametoid1, self._usernametoid2)
            cls.serializeDict(self._gtpairs, 'idpairsfromgt')
            cls.writeIdPairToSecondDegreeToTwoFiles(
                self._gtpairs, secondlevel1, secondlevel2,
                'results/SLD_GT_firstid.txt', 'results/SLD_GT_secondid.txt')

        notgtpairs = cls.generateIdPairsNotFromGroundTruth(
            self.nodefile1, self.nodefile2, self.numberofrandompairs)
        cls.serializeDict(notgtpairs, 'notgtidpairs')
        cls.writeIdPairToSecondDegreeToTwoFiles(
            notgtpairs, secondlevel1, secondlevel2,
            'results/SLD_NOTGT_firstid.txt', 'results/SLD_NOTGT_secondid.txt')

        # NOTE(review): the pickle names say "firstlevel" but the ranking is
        # computed from the second-level degree, exactly as in the original
        # code.  Confirm which of the two was intended.
        self._idtorank1 = cls.sortrankbysecondleveldegree(secondlevel1)
        cls.serializeDict(self._idtorank1, 'idtorankbyfirstlevel1')
        self._idtorank2 = cls.sortrankbysecondleveldegree(secondlevel2)
        cls.serializeDict(self._idtorank2, 'idtorankbyfirstlevel2')

    def detectSocialBots(self):
        """Return ``[rank1, rank2]`` for freshly drawn random id pairs.

        Pairs whose ids have no rank are skipped, mirroring the behaviour of
        ``detectStronglyConnectedComponents``.
        """
        bots = []
        notgtpairs = type(self).generateIdPairsNotFromGroundTruth(
            self.nodefile1, self.nodefile2, self.numberofrandompairs)
        for id1, id2 in notgtpairs:
            if id1 not in self._idtorank1 or id2 not in self._idtorank2:
                continue
            bots.append([self._idtorank1[id1], self._idtorank2[id2]])
        return bots

    def detectStronglyConnectedComponents(self):
        """Return ``[rank1, rank2]`` for every ground-truth pair with known ranks.

        Uses the ground-truth pairs collected by ``createNetwork``; the result
        is empty when no ground-truth file was configured.
        """
        scc = []
        for id1, id2 in self._gtpairs:
            key1, key2 = int(id1), int(id2)
            if key1 not in self._idtorank1 or key2 not in self._idtorank2:
                continue
            scc.append([self._idtorank1[key1], self._idtorank2[key2]])
        return scc

NameError: name 'AbstractBotEchoDetection' is not defined

The cell below exemplifies the test code I will invoke on your submission. Do NOT modify it. 

In [2]:
# DO NOT MODIFY THIS CELL
# Grading harness: builds one detector with no arguments, then times each of
# the three API calls in turn using timeit.default_timer().

import timeit
#import BotEchoDetection
# The import above is commented out, so BotEchoDetection has to be defined
# already (e.g. by an earlier cell) when this line runs.
testBotEcho = BotEchoDetection()

#
# testing the createNetwork() API 
#
starttime = timeit.default_timer()
testBotEcho.createNetwork()
endtime = timeit.default_timer()
print("\nExecution time to load/create a network:", round(endtime-starttime,3))

#
# testing the social bot detection
#
starttime = timeit.default_timer()
output = testBotEcho.detectSocialBots()
endtime = timeit.default_timer()
print("\nExecution time detectSocialBots:", round(endtime-starttime,3))

#
# testing the echo chamber detection
#
# NOTE: `output` is rebound here, replacing the social-bot result; only the
# elapsed time of each call is printed by this cell.
starttime = timeit.default_timer()
output = testBotEcho.detectStronglyConnectedComponents()
endtime = timeit.default_timer()
print("\nExecution time detectStronglyConnectedComponents:", round(endtime-starttime,3))


ModuleNotFoundError: No module named 'BotEchoDetection'

Use the cell below for all Python code you developed to test the `BotEchoDetection` class. For example, if you have been experimenting with different algorithms before making a final choice, and if you have conducted experiments on different synthetic data to evaluate your algorithms, put all the code you used below.

In [10]:
import timeit
import random

# ADD ALL YOUR TEST CODE HERE



