Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

modify spider

  • Loading branch information...
commit 7fd33d1a2e4b56d8ce6b61bf474964612377ee2b 1 parent 5d3bf12
Yemsheng authored
BIN  spider/.spider.cpp.swp
Binary file not shown
95 spider/spider.cpp
@@ -9,6 +9,17 @@
9 9 #include <string.h>
10 10 #include <unistd.h>
11 11
  12 +
  13 +const int URL_SIZE = 512;
  14 +const int FILE_NAME_SIZE = 512;
  15 +const int BUFFERSIZE = 1024;
  16 +const int HTTP_MSG_BUFFER_SIZE = 1024;
  17 +
  18 +char *g_domain;
  19 +char g_url[URL_SIZE];
  20 +
  21 +char *MakeHttpSendMsgContent(char *msg, const int msgSize);
  22 +
12 23 int main(int argc, char *argv[])
13 24 {
14 25 if(argc!=2)
@@ -16,18 +27,22 @@ int main(int argc, char *argv[])
16 27 perror("argc != 2");
17 28 exit(1);
18 29 }
19   - char *domain = argv[1];
20   - char url[512];
21   - memset(url, 0, sizeof(url));
22   - sprintf(url, "http://");
23   - strcat(url, domain);
24   - strcat(url, "/");
  30 + g_domain = argv[1];
  31 + memset(g_url, 0, sizeof(g_url));
  32 + sprintf(g_url, "http://");
  33 + strcat(g_url, g_domain);
  34 + strcat(g_url, "/");
  35 +
  36 + char saveFileName[FILE_NAME_SIZE];
  37 + memset(saveFileName, 0, sizeof(saveFileName));
  38 + sprintf(saveFileName, g_domain);
  39 + strcat(saveFileName, ".html");
25 40
26 41 struct hostent *h;
27 42 char *ipAddr = NULL;
28 43
29 44
30   - if ((h=gethostbyname(domain)) == NULL)
  45 + if ((h=gethostbyname(g_domain)) == NULL)
31 46 {
32 47 herror("gethostbyname");
33 48 exit(1);
@@ -58,19 +73,8 @@ int main(int argc, char *argv[])
58 73 perror("conncet failed\n");
59 74 }
60 75
61   - char msg[1024];
62   - memset(msg, 0, sizeof(msg));
63   - sprintf(msg,"GET ");
64   - strcat(msg, url);
65   - strcat(msg, " HTTP/1.1");
66   -
67   - strcat(msg, "\r\nHost: ");
68   - strcat(msg, domain);
69   -
70   - strcat(msg, "\r\nUser-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20100101 Firefox/15.0.1");
71   - strcat(msg, "\r\nAccept: */*");
72   - strcat(msg, "\r\nConnection: close\r\n\r\n");
73   - printf("%s\n",msg);
  76 + char msg[HTTP_MSG_BUFFER_SIZE];
  77 + MakeHttpSendMsgContent(msg, sizeof(msg));
74 78
75 79 int sendState = 0;
76 80 sendState = send(client_fd, msg, strlen(msg), 0);
@@ -81,7 +85,10 @@ int main(int argc, char *argv[])
81 85 }
82 86
83 87 int receiveLen = 0;
84   - char receiveBuffer[1024];
  88 + char receiveBuffer[BUFFERSIZE];
  89 +
  90 + FILE *fout;
  91 + fout= fopen(saveFileName, "w");
85 92 while(true)
86 93 {
87 94
@@ -89,8 +96,54 @@ int main(int argc, char *argv[])
89 96 printf("receive len = %d\n", receiveLen);
90 97 if(receiveLen<=0)
91 98 break;
  99 + fwrite(receiveBuffer, receiveLen, 1, fout);
92 100 }
  101 + fclose(fout);
93 102 close(client_fd);
94 103
95 104 return 0;
96 105 }
  106 +
  107 +char *MakeHttpSendMsgContent(char *msg, const int msgSize)
  108 +{
  109 + if(msg==NULL||msgSize<=0)
  110 + return NULL;
  111 +
  112 + memset(msg, 0, msgSize);
  113 + int leftSize = msgSize;
  114 +
  115 + snprintf(msg, leftSize, "GET ");
  116 +
  117 + leftSize = msgSize - strlen(msg);
  118 + if(leftSize>0)
  119 + //strncat(msg, "http://product.dangdang.com/product.aspx?product_id=1039656721", leftSize);
  120 + strncat(msg, g_url, leftSize);
  121 +
  122 + leftSize = msgSize - strlen(msg);
  123 + if(leftSize>0)
  124 + strncat(msg, " HTTP/1.1", leftSize);
  125 +
  126 + leftSize = msgSize - strlen(msg);
  127 + if(leftSize>0)
  128 + strncat(msg, "\r\nHost: ", leftSize);
  129 +
  130 + leftSize = msgSize - strlen(msg);
  131 + if(leftSize>0)
  132 + strncat(msg, g_domain, leftSize);
  133 +
  134 + leftSize = msgSize - strlen(msg);
  135 + if(leftSize>0)
  136 + strncat(msg, "\r\nUser-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20100101 Firefox/15.0.1", leftSize);
  137 +
  138 + leftSize = msgSize - strlen(msg);
  139 + if(leftSize>0)
  140 + strncat(msg, "\r\nAccept: */*", leftSize);
  141 +
  142 + leftSize = msgSize - strlen(msg);
  143 + if(leftSize>0)
  144 + strncat(msg, "\r\nConnection: close\r\n\r\n", leftSize);
  145 +
  146 + printf("%s\ntoatl msgBuffer size = %d msglen = %d leftSize = %d\n",msg,msgSize, strlen(msg), leftSize);
  147 +
  148 + return msg;
  149 +}
149 spider/spider.cpp~
... ... @@ -0,0 +1,149 @@
  1 +#include <stdio.h>
  2 +#include <stdlib.h>
  3 +#include <errno.h>
  4 +#include <netdb.h>
  5 +#include <sys/types.h>
  6 +#include <netinet/in.h>
  7 +#include <sys/socket.h>
  8 +#include <arpa/inet.h>
  9 +#include <string.h>
  10 +#include <unistd.h>
  11 +
  12 +
  13 +const int URL_SIZE = 512;
  14 +const int FILE_NAME_SIZE = 512;
  15 +const int BUFFERSIZE = 1024;
  16 +const int HTTP_MSG_BUFFER_SIZE = 1024;
  17 +
  18 +char *g_domain;
  19 +char g_url[URL_SIZE];
  20 +
  21 +char *MakeHttpSendMsgContent(char *msg, const int msgSize);
  22 +
  23 +int main(int argc, char *argv[])
  24 +{
  25 + if(argc!=2)
  26 + {
  27 + perror("argc != 2");
  28 + exit(1);
  29 + }
  30 + g_domain = argv[1];
  31 + memset(g_url, 0, sizeof(g_url));
  32 + sprintf(g_url, "http://");
  33 + strcat(g_url, g_domain);
  34 + strcat(g_url, "/");
  35 +
  36 + char saveFileName[FILE_NAME_SIZE];
  37 + memset(saveFileName, 0, sizeof(saveFileName));
  38 + sprintf(saveFileName, g_domain);
  39 + strcat(saveFileName, ".html");
  40 +
  41 + struct hostent *h;
  42 + char *ipAddr = NULL;
  43 +
  44 +
  45 + if ((h=gethostbyname(g_domain)) == NULL)
  46 + {
  47 + herror("gethostbyname");
  48 + exit(1);
  49 + }
  50 +
  51 + printf("Host name : %s\n", h->h_name);
  52 + ipAddr = inet_ntoa(*((struct in_addr*)h->h_addr));
  53 + printf("IP Address : %s\n", ipAddr);
  54 +
  55 + int client_fd;
  56 + client_fd = socket(AF_INET, SOCK_STREAM, 0);
  57 + if(client_fd==-1)
  58 + {
  59 + perror("socket failed\n");
  60 + exit(1);
  61 + }
  62 +
  63 + struct sockaddr_in server_addr;
  64 + memset(&server_addr, 0, sizeof(server_addr));
  65 + server_addr.sin_family = AF_INET;
  66 + server_addr.sin_port = htons(80);
  67 + server_addr.sin_addr = *((struct in_addr*)h->h_addr);
  68 +
  69 + int connectState;
  70 + connectState = connect(client_fd, (struct sockaddr*)&server_addr, sizeof(server_addr));
  71 + if(connectState==-1)
  72 + {
  73 + perror("conncet failed\n");
  74 + }
  75 +
  76 + char msg[HTTP_MSG_BUFFER_SIZE];
  77 + MakeHttpSendMsgContent(msg, sizeof(msg));
  78 +
  79 + int sendState = 0;
  80 + sendState = send(client_fd, msg, strlen(msg), 0);
  81 + if(sendState==-1)
  82 + {
  83 + perror("send Error\n");
  84 + exit(1);
  85 + }
  86 +
  87 + int receiveLen = 0;
  88 + char receiveBuffer[BUFFERSIZE];
  89 +
  90 + FILE *fout;
  91 + fout= fopen(saveFileName, "w");
  92 + while(true)
  93 + {
  94 +
  95 + receiveLen = recv(client_fd, receiveBuffer,sizeof(receiveBuffer), 0);
  96 + printf("receive len = %d\n", receiveLen);
  97 + if(receiveLen<=0)
  98 + break;
  99 + fwrite(receiveBuffer, receiveLen, 1, fout);
  100 + }
  101 + fclose(fout);
  102 + close(client_fd);
  103 +
  104 + return 0;
  105 +}
  106 +
  107 +char *MakeHttpSendMsgContent(char *msg, const int msgSize)
  108 +{
  109 + if(msg==NULL||msgSize<=0)
  110 + return NULL;
  111 +
  112 + memset(msg, 0, msgSize);
  113 + int leftSize = msgSize;
  114 +
  115 + snprintf(msg, leftSize, "GET ");
  116 +
  117 + leftSize = msgSize - strlen(msg);
  118 + if(leftSize>0)
  119 + strncat(msg, "http://product.dangdang.com/product.aspx?product_id=1039656721", leftSize);
  120 + //strncat(msg, g_url, leftSize);
  121 +
  122 + leftSize = msgSize - strlen(msg);
  123 + if(leftSize>0)
  124 + strncat(msg, " HTTP/1.1", leftSize);
  125 +
  126 + leftSize = msgSize - strlen(msg);
  127 + if(leftSize>0)
  128 + strncat(msg, "\r\nHost: ", leftSize);
  129 +
  130 + leftSize = msgSize - strlen(msg);
  131 + if(leftSize>0)
  132 + strncat(msg, g_domain, leftSize);
  133 +
  134 + leftSize = msgSize - strlen(msg);
  135 + if(leftSize>0)
  136 + strncat(msg, "\r\nUser-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20100101 Firefox/15.0.1", leftSize);
  137 +
  138 + leftSize = msgSize - strlen(msg);
  139 + if(leftSize>0)
  140 + strncat(msg, "\r\nAccept: */*", leftSize);
  141 +
  142 + leftSize = msgSize - strlen(msg);
  143 + if(leftSize>0)
  144 + strncat(msg, "\r\nConnection: close\r\n\r\n", leftSize);
  145 +
  146 + printf("%s\ntoatl msgBuffer size = %d msglen = %d leftSize = %d\n",msg,msgSize, strlen(msg), leftSize);
  147 +
  148 + return msg;
  149 +}
1,507 spider/www.dangdang.com2.html~
1,507 additions, 0 deletions not shown

0 comments on commit 7fd33d1

Please sign in to comment.
Something went wrong with that request. Please try again.